| """ |
| TurkTokenizer β production-ready Turkish morphological tokenizer. |
| |
| Applies 12 sequential fixes on top of the base turkish-tokenizer: |
| 1. ALL CAPS inflation fix |
| 2. Apostrophe / code-switching split |
| 3. BPEβSUFFIX reclassification |
| 4. Zemberek root validation & correction |
| 5. Punctuation β PUNCT type |
| 6. Domain vocabulary (medical / sports / tourism) |
| 7. TDK-based FOREIGN word detection |
| 8. Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI) |
| 9. Allomorph canonicalization |
| 10. Compound word decomposition |
| 11. Acronym expansion |
| 12. Context-aware Zemberek disambiguation |
| |
| Output fields per token: |
| token : str β token string (leading space = word-initial) |
| token_type : str β ROOT | SUFFIX | FOREIGN | BPE | PUNCT | |
| NUM | DATE | UNIT | URL | MENTION | HASHTAG | EMOJI |
| morph_pos : int β 0=root/word-initial, 1=first suffix, 2=second suffixβ¦ |
| (+ optional _* metadata fields) |
| """ |
|
|
| from __future__ import annotations |
|
|
| import os |
| import multiprocessing |
| from concurrent.futures import ProcessPoolExecutor, as_completed |
| from pathlib import Path |
|
|
| from ._java_check import ensure_java |
| from ._preprocessor import preprocess, postprocess |
| from ._suffix_expander import reclassify_bpe_suffixes |
| from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE |
| from ._medical_vocab import ALL_DOMAIN_ROOTS |
| from ._tdk_vocab import reclassify_foreign_words |
| from ._normalizer import ( |
| preprocess_special_tokens, |
| restore_special_tokens, |
| reclassify_numbers_in_tokens, |
| ) |
| from ._allomorph import add_canonical_labels |
| from ._compound import add_compound_info |
| from ._acronym_dict import reclassify_acronyms |
| from ._context_aware import annotate_with_context |
|
|
try:
    # Reuse the Zemberek morphology object that _root_validator already
    # created (shared with compound analysis below); importing the private
    # name may fail when Zemberek/Java is unavailable.
    from ._root_validator import _morphology as _zemb_morphology
except Exception:
    # Zemberek is optional — degrade gracefully to no morphology backend.
    _zemb_morphology = None


# Lower-cased domain vocabulary for case-insensitive root lookups.
_DOMAIN_ROOTS_LOWER = {k.lower() for k in ALL_DOMAIN_ROOTS}
|
|
| |
|
|
# Token types produced by special-token normalization (fix 8). These always
# reset morph_pos to 0 in _add_output_fields and count as "special" in stats().
_SPECIAL_TYPES = frozenset(
    ("NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI")
)


# One-character symbol per token type — presumably for compact debug/trace
# rendering by external tooling; not referenced in this module (verify callers).
_TYPE_SYM = {
    "ROOT": "R", "SUFFIX": "S", "FOREIGN": "F", "BPE": "B", "PUNCT": "P",
    "NUM": "N", "DATE": "D", "UNIT": "U",
    "URL": "L", "MENTION": "@", "HASHTAG": "#", "EMOJI": "E",
}
|
|
|
|
| |
|
|
| _worker_tok: "TurkTokenizer | None" = None |
|
|
|
|
def _init_worker() -> None:
    """ProcessPoolExecutor initializer: build one TurkTokenizer per worker.

    Runs once in each child process so the (expensive) tokenizer setup is
    paid per worker, not per task.
    """
    global _worker_tok
    _worker_tok = TurkTokenizer()
|
|
|
|
def _tokenize_one(text: str) -> list[dict]:
    """Tokenize *text* with the process-local tokenizer (worker processes only)."""
    tok = _worker_tok
    # Invariant: _init_worker has run in this process before any task.
    assert tok is not None
    return tok.tokenize(text)
|
|
|
|
| |
|
|
class TurkTokenizer:
    """
    Turkish morphological tokenizer with HuggingFace-compatible interface.

    Example::

        from turk_tokenizer import TurkTokenizer

        tok = TurkTokenizer()
        tokens = tok("İstanbul'da meeting'e katılamadım")
        for t in tokens:
            print(t["token"], t["token_type"], t["morph_pos"])
    """

    def __init__(self) -> None:
        """Build the base tokenizer; requires a working Java runtime."""
        ensure_java()
        # Lazy import: keeps `import turk_tokenizer` cheap and surfaces a
        # missing-dependency error at construction time, where it is actionable.
        from turkish_tokenizer import TurkishTokenizer
        self._base = TurkishTokenizer()
        # Lets callers check whether the Zemberek-backed fixes (4, 12) are
        # active in this environment.
        self.zemberek_available = ZEMBEREK_AVAILABLE

    def __call__(self, text: str) -> list[dict]:
        """Shorthand for :meth:`tokenize`."""
        return self.tokenize(text)

    def tokenize(self, text: str) -> list[dict]:
        """Tokenize a single text string.

        Runs the base tokenizer output through the fix pipeline in a fixed
        order: special-token masking first, output-field computation last.

        Returns a list of token dicts, each with:
        ``token``, ``token_type``, ``morph_pos``, and optional ``_*`` fields.
        """
        # Fix 8: mask NUM/DATE/URL/... so the base tokenizer cannot split them.
        text_norm, specials = preprocess_special_tokens(text)
        # Fix 1: normalize ALL-CAPS words; caps_map records what to restore.
        processed, caps_map = preprocess(text_norm)
        raw = self._base.tokenize_text(processed)
        tokens = restore_special_tokens(raw, specials)
        tokens = postprocess(tokens, caps_map)
        # Fix 3: BPE fragments that are actually Turkish suffixes.
        tokens = reclassify_bpe_suffixes(tokens)
        tokens = reclassify_numbers_in_tokens(tokens)
        # Fix 6: known medical/sports/tourism roots mislabeled as BPE.
        tokens = _reclassify_domain_roots(tokens, _DOMAIN_ROOTS_LOWER)
        # Fix 7: words absent from TDK become FOREIGN.
        tokens = reclassify_foreign_words(tokens)
        tokens = reclassify_acronyms(tokens)
        tokens = add_canonical_labels(tokens)
        tokens = add_compound_info(tokens, morphology=_zemb_morphology)
        # Fixes 12 and 4 see the ORIGINAL text, not the masked/normalized one.
        tokens = annotate_with_context(tokens, text)
        tokens = validate_roots(tokens, text.split(), base_tokenizer=self._base)
        return _add_output_fields(tokens)

    def batch_tokenize(
        self,
        texts: list[str],
        workers: int | None = None,
        chunk_size: int = 64,
    ) -> list[list[dict]]:
        """Tokenize a list of texts in parallel.

        Args:
            texts: List of strings to tokenize.
            workers: Number of worker processes (None = all CPUs).
            chunk_size: Below this count, run sequentially to avoid the
                process-pool startup overhead.

        Returns:
            List of token lists, in the same order as ``texts``.
        """
        if not texts:
            return []

        n = workers or os.cpu_count() or 4

        # Small batches (or a single worker) are faster in-process.
        if len(texts) <= chunk_size or n == 1:
            return [self.tokenize(t) for t in texts]

        results: list[list[dict] | None] = [None] * len(texts)

        with ProcessPoolExecutor(max_workers=n, initializer=_init_worker) as pool:
            futs = {pool.submit(_tokenize_one, t): i for i, t in enumerate(texts)}
            for fut in as_completed(futs):
                i = futs[fut]
                try:
                    results[i] = fut.result()
                except Exception as exc:
                    print(f"[TurkTokenizer] fallback at idx={i}: {exc}")
                    # FIX: retry in-process first so the fallback keeps the
                    # documented schema (token_type / morph_pos) — previously
                    # this returned raw base tokens lacking those fields. The
                    # worker may have died for reasons unrelated to the text.
                    try:
                        results[i] = self.tokenize(texts[i])
                    except Exception:
                        # Last resort: raw base output (may lack token_type /
                        # morph_pos) rather than dropping the text entirely.
                        results[i] = self._base.tokenize_text(texts[i])

        return results

    @classmethod
    def from_pretrained(cls, _model_id: str = "Ethosoft/turk-tokenizer") -> "TurkTokenizer":
        """Load tokenizer (rules-based, no weights to download).

        ``_model_id`` is accepted for HF-API compatibility and ignored.
        """
        return cls()

    def save_pretrained(self, save_directory: str) -> None:
        """Save tokenizer config to a directory (for HF Hub compatibility)."""
        import json
        path = Path(save_directory)
        path.mkdir(parents=True, exist_ok=True)
        config = {
            "tokenizer_class": "TurkTokenizer",
            "model_type": "turk-tokenizer",
            "version": "1.0.0",
            "zemberek_available": self.zemberek_available,
        }
        (path / "tokenizer_config.json").write_text(
            json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
        )

    def stats(self, tokens: list[dict]) -> dict:
        """Compute morphological coverage statistics for a token list.

        Returns counts per type plus ``tr_pct`` (share of tokens with a
        definite classification: ROOT/SUFFIX/FOREIGN/PUNCT/special) and
        ``pure_pct`` (share of real morphological tokens, excluding
        ``<...>`` placeholders).
        """
        total = len(tokens)
        keys = ("total", "roots", "suffixes", "foreign",
                "bpe", "punct", "special", "tr_pct", "pure_pct")
        if total == 0:
            return {k: 0 for k in keys}

        # Single pass instead of one scan per category.
        counts = {"ROOT": 0, "SUFFIX": 0, "FOREIGN": 0, "BPE": 0, "PUNCT": 0}
        special = 0
        pure = 0
        for t in tokens:
            tt = t["token_type"]
            if tt in counts:
                counts[tt] += 1
            if tt in _SPECIAL_TYPES:
                special += 1
            if (tt in ("ROOT", "SUFFIX", "FOREIGN")
                    and not t["token"].strip().startswith("<")):
                pure += 1

        tr = (counts["ROOT"] + counts["SUFFIX"] + counts["FOREIGN"]
              + counts["PUNCT"] + special)
        return {
            "total": total,
            "roots": counts["ROOT"],
            "suffixes": counts["SUFFIX"],
            "foreign": counts["FOREIGN"],
            "bpe": counts["BPE"],
            "punct": counts["PUNCT"],
            "special": special,
            "tr_pct": round(tr / total * 100, 2),
            "pure_pct": round(pure / total * 100, 2),
        }
|
|
|
|
| |
|
|
| def _reclassify_domain_roots(tokens: list[dict], domain_lower: set) -> list[dict]: |
| result = [] |
| for tok in tokens: |
| if tok["type"] != "BPE": |
| result.append(tok) |
| continue |
| raw = tok["token"] |
| if raw == raw.lstrip(): |
| result.append(tok) |
| continue |
| if raw.lstrip().lower() in domain_lower: |
| result.append({**tok, "type": "ROOT", "_domain": True}) |
| else: |
| result.append(tok) |
| return result |
|
|
|
|
| def _add_output_fields(tokens: list[dict]) -> list[dict]: |
| """Compute token_type and morph_pos and add them to every token.""" |
| result = [] |
| word_pos = 0 |
|
|
| for tok in tokens: |
| raw = tok["token"] |
| base_type = tok["type"] |
| stripped = raw.strip() |
|
|
| |
| if base_type == "ROOT" and tok.get("_foreign"): |
| token_type = "FOREIGN" |
| else: |
| token_type = base_type |
|
|
| |
| is_word_start = raw.startswith(" ") or stripped.startswith("<") |
|
|
| if is_word_start or base_type in _SPECIAL_TYPES or base_type == "PUNCT": |
| word_pos = 0 |
| morph_pos = 0 |
| elif base_type == "SUFFIX": |
| word_pos += 1 |
| morph_pos = word_pos |
| else: |
| |
| word_pos = 0 |
| morph_pos = 0 |
|
|
| result.append({**tok, "token_type": token_type, "morph_pos": morph_pos}) |
|
|
| return result |
|
|