""" TurkTokenizer — production-ready Turkish morphological tokenizer. Applies 12 sequential fixes on top of the base turkish-tokenizer: 1. ALL CAPS inflation fix 2. Apostrophe / code-switching split 3. BPE→SUFFIX reclassification 4. Zemberek root validation & correction 5. Punctuation → PUNCT type 6. Domain vocabulary (medical / sports / tourism) 7. TDK-based FOREIGN word detection 8. Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI) 9. Allomorph canonicalization 10. Compound word decomposition 11. Acronym expansion 12. Context-aware Zemberek disambiguation Output fields per token: token : str — token string (leading space = word-initial) token_type : str — ROOT | SUFFIX | FOREIGN | BPE | PUNCT | NUM | DATE | UNIT | URL | MENTION | HASHTAG | EMOJI morph_pos : int — 0=root/word-initial, 1=first suffix, 2=second suffix… (+ optional _* metadata fields) """ from __future__ import annotations import os import multiprocessing from concurrent.futures import ProcessPoolExecutor, as_completed from pathlib import Path from ._java_check import ensure_java from ._preprocessor import preprocess, postprocess from ._suffix_expander import reclassify_bpe_suffixes from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE from ._medical_vocab import ALL_DOMAIN_ROOTS from ._tdk_vocab import reclassify_foreign_words from ._normalizer import ( preprocess_special_tokens, restore_special_tokens, reclassify_numbers_in_tokens, ) from ._allomorph import add_canonical_labels from ._compound import add_compound_info from ._acronym_dict import reclassify_acronyms from ._context_aware import annotate_with_context try: from ._root_validator import _morphology as _zemb_morphology except Exception: _zemb_morphology = None _DOMAIN_ROOTS_LOWER = {k.lower() for k in ALL_DOMAIN_ROOTS} # ── Token types ─────────────────────────────────────────────────────────────── _SPECIAL_TYPES = frozenset( ("NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI") ) _TYPE_SYM = { "ROOT": "R", "SUFFIX": "S", "FOREIGN": "F", "BPE": "B", "PUNCT": "P", "NUM": "N", "DATE": "D", "UNIT": "U", "URL": "L", "MENTION": "@", "HASHTAG": "#", "EMOJI": "E", } # ── Parallel worker helpers ─────────────────────────────────────────────────── _worker_tok: "TurkTokenizer | None" = None def _init_worker() -> None: global _worker_tok _worker_tok = TurkTokenizer() def _tokenize_one(text: str) -> list[dict]: assert _worker_tok is not None return _worker_tok.tokenize(text) # ══════════════════════════════════════════════════════════════════════════════ class TurkTokenizer: """ Turkish morphological tokenizer with HuggingFace-compatible interface. Example:: from turk_tokenizer import TurkTokenizer tok = TurkTokenizer() tokens = tok("İstanbul'da meeting'e katılamadım") for t in tokens: print(t["token"], t["token_type"], t["morph_pos"]) """ def __init__(self) -> None: ensure_java() from turkish_tokenizer import TurkishTokenizer # noqa: PLC0415 self._base = TurkishTokenizer() self.zemberek_available = ZEMBEREK_AVAILABLE # ── Public API ──────────────────────────────────────────────────────────── def __call__(self, text: str) -> list[dict]: return self.tokenize(text) def tokenize(self, text: str) -> list[dict]: """Tokenize a single text string. Returns a list of token dicts, each with: ``token``, ``token_type``, ``morph_pos``, and optional ``_*`` fields. """ # Fix 8 pre: replace URLs, mentions, numbers etc. with placeholders text_norm, specials = preprocess_special_tokens(text) # Fix 1 & 2 pre: ALL CAPS + apostrophe processed, caps_map = preprocess(text_norm) # Base tokenizer raw = self._base.tokenize_text(processed) # Fix 8 post: restore placeholders tokens = restore_special_tokens(raw, specials) # Fix 1 & 2 post tokens = postprocess(tokens, caps_map) # Fix 3 + 5: BPE→SUFFIX reclassification + PUNCT tokens = reclassify_bpe_suffixes(tokens) # Fix 8b: remaining numbers / units tokens = reclassify_numbers_in_tokens(tokens) # Fix 6: domain vocabulary (medical / sports / tourism) tokens = _reclassify_domain_roots(tokens, _DOMAIN_ROOTS_LOWER) # Fix 7: TDK FOREIGN detection tokens = reclassify_foreign_words(tokens) # Fix 11: acronym expansions tokens = reclassify_acronyms(tokens) # Fix 9: allomorph canonical labels tokens = add_canonical_labels(tokens) # Fix 10: compound word annotation tokens = add_compound_info(tokens, morphology=_zemb_morphology) # Fix 12: context-aware Zemberek disambiguation tokens = annotate_with_context(tokens, text) # Fix 4: Zemberek root validation & correction tokens = validate_roots(tokens, text.split(), base_tokenizer=self._base) # Add public output fields tokens = _add_output_fields(tokens) return tokens def batch_tokenize( self, texts: list[str], workers: int | None = None, chunk_size: int = 64, ) -> list[list[dict]]: """Tokenize a list of texts in parallel. Args: texts: List of strings to tokenize. workers: Number of worker processes (None = all CPUs). chunk_size: Below this count, run sequentially to avoid overhead. Returns: List of token lists, in the same order as ``texts``. """ if not texts: return [] n = workers or os.cpu_count() or 4 if len(texts) <= chunk_size or n == 1: return [self.tokenize(t) for t in texts] results: list[list[dict] | None] = [None] * len(texts) with ProcessPoolExecutor(max_workers=n, initializer=_init_worker) as pool: futs = {pool.submit(_tokenize_one, t): i for i, t in enumerate(texts)} for fut in as_completed(futs): i = futs[fut] try: results[i] = fut.result() except Exception as exc: # noqa: BLE001 results[i] = self._base.tokenize_text(texts[i]) print(f"[TurkTokenizer] fallback at idx={i}: {exc}") return results # type: ignore[return-value] # ── HuggingFace-style helpers ───────────────────────────────────────────── @classmethod def from_pretrained(cls, _model_id: str = "Ethosoft/turk-tokenizer") -> "TurkTokenizer": """Load tokenizer (rules-based, no weights to download).""" return cls() def save_pretrained(self, save_directory: str) -> None: """Save tokenizer config to a directory (for HF Hub compatibility).""" import json path = Path(save_directory) path.mkdir(parents=True, exist_ok=True) config = { "tokenizer_class": "TurkTokenizer", "model_type": "turk-tokenizer", "version": "1.0.0", "zemberek_available": self.zemberek_available, } (path / "tokenizer_config.json").write_text( json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8" ) # ── Utility ─────────────────────────────────────────────────────────────── def stats(self, tokens: list[dict]) -> dict: """Compute morphological coverage statistics for a token list.""" total = len(tokens) if total == 0: return {k: 0 for k in ("total", "roots", "suffixes", "foreign", "bpe", "punct", "special", "tr_pct", "pure_pct")} roots = sum(1 for t in tokens if t["token_type"] == "ROOT") suffixes = sum(1 for t in tokens if t["token_type"] == "SUFFIX") foreign = sum(1 for t in tokens if t["token_type"] == "FOREIGN") punct = sum(1 for t in tokens if t["token_type"] == "PUNCT") bpe = sum(1 for t in tokens if t["token_type"] == "BPE") special = sum(1 for t in tokens if t["token_type"] in _SPECIAL_TYPES) tr = roots + suffixes + foreign + punct + special pure = sum( 1 for t in tokens if t["token_type"] in ("ROOT", "SUFFIX", "FOREIGN") and not t["token"].strip().startswith("<") ) return { "total": total, "roots": roots, "suffixes": suffixes, "foreign": foreign, "bpe": bpe, "punct": punct, "special": special, "tr_pct": round(tr / total * 100, 2), "pure_pct": round(pure / total * 100, 2), } # ── Internal helpers ────────────────────────────────────────────────────────── def _reclassify_domain_roots(tokens: list[dict], domain_lower: set) -> list[dict]: result = [] for tok in tokens: if tok["type"] != "BPE": result.append(tok) continue raw = tok["token"] if raw == raw.lstrip(): # no leading space → not word-initial result.append(tok) continue if raw.lstrip().lower() in domain_lower: result.append({**tok, "type": "ROOT", "_domain": True}) else: result.append(tok) return result def _add_output_fields(tokens: list[dict]) -> list[dict]: """Compute token_type and morph_pos and add them to every token.""" result = [] word_pos = 0 for tok in tokens: raw = tok["token"] base_type = tok["type"] stripped = raw.strip() # ── token_type: FOREIGN for foreign ROOTs ───────────────────────── if base_type == "ROOT" and tok.get("_foreign"): token_type = "FOREIGN" else: token_type = base_type # ── morph_pos ───────────────────────────────────────────────────── is_word_start = raw.startswith(" ") or stripped.startswith("<") if is_word_start or base_type in _SPECIAL_TYPES or base_type == "PUNCT": word_pos = 0 morph_pos = 0 elif base_type == "SUFFIX": word_pos += 1 morph_pos = word_pos else: # ROOT or BPE within a word (no leading space) word_pos = 0 morph_pos = 0 result.append({**tok, "token_type": token_type, "morph_pos": morph_pos}) return result