"""
TurkTokenizer — production-ready Turkish morphological tokenizer.
Applies 12 sequential fixes on top of the base turkish-tokenizer:
1. ALL CAPS inflation fix
2. Apostrophe / code-switching split
3. BPE→SUFFIX reclassification
4. Zemberek root validation & correction
5. Punctuation → PUNCT type
6. Domain vocabulary (medical / sports / tourism)
7. TDK-based FOREIGN word detection
8. Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI)
9. Allomorph canonicalization
10. Compound word decomposition
11. Acronym expansion
12. Context-aware Zemberek disambiguation
Output fields per token:
    token      : str — token string (leading space = word-initial)
    token_type : str — ROOT | SUFFIX | FOREIGN | BPE | PUNCT |
                       NUM | DATE | UNIT | URL | MENTION | HASHTAG | EMOJI
    morph_pos  : int — 0=root/word-initial, 1=first suffix, 2=second suffix…
(+ optional _* metadata fields)
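
Illustrative example (exact segmentation depends on the base tokenizer and
installed resources):
    "evlerde" → [(" ev", ROOT, 0), ("ler", SUFFIX, 1), ("de", SUFFIX, 2)]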
"""
from __future__ import annotations
import os
import multiprocessing
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
from ._java_check import ensure_java
from ._preprocessor import preprocess, postprocess
from ._suffix_expander import reclassify_bpe_suffixes
from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
from ._medical_vocab import ALL_DOMAIN_ROOTS
from ._tdk_vocab import reclassify_foreign_words
from ._normalizer import (
preprocess_special_tokens,
restore_special_tokens,
reclassify_numbers_in_tokens,
)
from ._allomorph import add_canonical_labels
from ._compound import add_compound_info
from ._acronym_dict import reclassify_acronyms
from ._context_aware import annotate_with_context
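# Optional handle to the shared Zemberek morphology object used by the
# compound-word step; left as None when Zemberek cannot be loaded.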
try:
from ._root_validator import _morphology as _zemb_morphology
except Exception:
_zemb_morphology = None
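# Lower-cased domain roots for case-insensitive lookup during reclassification.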
_DOMAIN_ROOTS_LOWER = {k.lower() for k in ALL_DOMAIN_ROOTS}
# ── Token types ───────────────────────────────────────────────────────────────
_SPECIAL_TYPES = frozenset(
("NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI")
)
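# Single-character symbol for each token type.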
_TYPE_SYM = {
"ROOT": "R", "SUFFIX": "S", "FOREIGN": "F", "BPE": "B", "PUNCT": "P",
"NUM": "N", "DATE": "D", "UNIT": "U",
"URL": "L", "MENTION": "@", "HASHTAG": "#", "EMOJI": "E",
}
# ── Parallel worker helpers ───────────────────────────────────────────────────
_worker_tok: "TurkTokenizer | None" = None
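# Each worker process builds its own TurkTokenizer once via the pool
# initializer, so tokenizer state never has to be pickled per task.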
def _init_worker() -> None:
global _worker_tok
_worker_tok = TurkTokenizer()
def _tokenize_one(text: str) -> list[dict]:
assert _worker_tok is not None
return _worker_tok.tokenize(text)
# ══════════════════════════════════════════════════════════════════════════════
class TurkTokenizer:
"""
Turkish morphological tokenizer with HuggingFace-compatible interface.
Example::
from turk_tokenizer import TurkTokenizer
tok = TurkTokenizer()
        tokens = tok("İstanbul'da meeting'e katılamadım")
for t in tokens:
print(t["token"], t["token_type"], t["morph_pos"])
"""
def __init__(self) -> None:
ensure_java()
from turkish_tokenizer import TurkishTokenizer # noqa: PLC0415
self._base = TurkishTokenizer()
self.zemberek_available = ZEMBEREK_AVAILABLE
# ── Public API ────────────────────────────────────────────────────────────
def __call__(self, text: str) -> list[dict]:
return self.tokenize(text)
def tokenize(self, text: str) -> list[dict]:
"""Tokenize a single text string.
Returns a list of token dicts, each with:
``token``, ``token_type``, ``morph_pos``, and optional ``_*`` fields.
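
        Illustrative example (exact output depends on the base tokenizer)::

            >>> TurkTokenizer().tokenize("evlerde")      # doctest: +SKIP
            [{'token': ' ev', 'token_type': 'ROOT', 'morph_pos': 0, ...},
             {'token': 'ler', 'token_type': 'SUFFIX', 'morph_pos': 1, ...},
             {'token': 'de', 'token_type': 'SUFFIX', 'morph_pos': 2, ...}]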
"""
# Fix 8 pre: replace URLs, mentions, numbers etc. with placeholders
text_norm, specials = preprocess_special_tokens(text)
# Fix 1 & 2 pre: ALL CAPS + apostrophe
processed, caps_map = preprocess(text_norm)
# Base tokenizer
raw = self._base.tokenize_text(processed)
# Fix 8 post: restore placeholders
tokens = restore_special_tokens(raw, specials)
# Fix 1 & 2 post
tokens = postprocess(tokens, caps_map)
# Fix 3 + 5: BPE→SUFFIX reclassification + PUNCT
tokens = reclassify_bpe_suffixes(tokens)
# Fix 8b: remaining numbers / units
tokens = reclassify_numbers_in_tokens(tokens)
# Fix 6: domain vocabulary (medical / sports / tourism)
tokens = _reclassify_domain_roots(tokens, _DOMAIN_ROOTS_LOWER)
# Fix 7: TDK FOREIGN detection
tokens = reclassify_foreign_words(tokens)
# Fix 11: acronym expansions
tokens = reclassify_acronyms(tokens)
# Fix 9: allomorph canonical labels
tokens = add_canonical_labels(tokens)
# Fix 10: compound word annotation
tokens = add_compound_info(tokens, morphology=_zemb_morphology)
# Fix 12: context-aware Zemberek disambiguation
tokens = annotate_with_context(tokens, text)
# Fix 4: Zemberek root validation & correction
tokens = validate_roots(tokens, text.split(), base_tokenizer=self._base)
# Add public output fields
tokens = _add_output_fields(tokens)
return tokens
def batch_tokenize(
self,
texts: list[str],
workers: int | None = None,
chunk_size: int = 64,
) -> list[list[dict]]:
"""Tokenize a list of texts in parallel.
Args:
texts: List of strings to tokenize.
workers: Number of worker processes (None = all CPUs).
            chunk_size: If ``len(texts)`` is at or below this threshold,
                tokenize sequentially to avoid process-pool overhead.
Returns:
List of token lists, in the same order as ``texts``.
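
        Example (illustrative)::

            results = tok.batch_tokenize(["metin bir", "metin iki"], workers=2)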
"""
if not texts:
return []
n = workers or os.cpu_count() or 4
if len(texts) <= chunk_size or n == 1:
return [self.tokenize(t) for t in texts]
results: list[list[dict] | None] = [None] * len(texts)
with ProcessPoolExecutor(max_workers=n, initializer=_init_worker) as pool:
futs = {pool.submit(_tokenize_one, t): i for i, t in enumerate(texts)}
for fut in as_completed(futs):
i = futs[fut]
try:
results[i] = fut.result()
except Exception as exc: # noqa: BLE001
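                    # Fall back to the raw base-tokenizer output for this text;
                    # these tokens lack the 12 fixes and the public fields.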
results[i] = self._base.tokenize_text(texts[i])
print(f"[TurkTokenizer] fallback at idx={i}: {exc}")
return results # type: ignore[return-value]
# ── HuggingFace-style helpers ─────────────────────────────────────────────
@classmethod
def from_pretrained(cls, _model_id: str = "Ethosoft/turk-tokenizer") -> "TurkTokenizer":
"""Load tokenizer (rules-based, no weights to download)."""
return cls()
def save_pretrained(self, save_directory: str) -> None:
"""Save tokenizer config to a directory (for HF Hub compatibility)."""
import json
path = Path(save_directory)
path.mkdir(parents=True, exist_ok=True)
config = {
"tokenizer_class": "TurkTokenizer",
"model_type": "turk-tokenizer",
"version": "1.0.0",
"zemberek_available": self.zemberek_available,
}
(path / "tokenizer_config.json").write_text(
json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
)
# ── Utility ───────────────────────────────────────────────────────────────
def stats(self, tokens: list[dict]) -> dict:
"""Compute morphological coverage statistics for a token list."""
total = len(tokens)
if total == 0:
return {k: 0 for k in ("total", "roots", "suffixes", "foreign",
"bpe", "punct", "special", "tr_pct", "pure_pct")}
roots = sum(1 for t in tokens if t["token_type"] == "ROOT")
suffixes = sum(1 for t in tokens if t["token_type"] == "SUFFIX")
foreign = sum(1 for t in tokens if t["token_type"] == "FOREIGN")
punct = sum(1 for t in tokens if t["token_type"] == "PUNCT")
bpe = sum(1 for t in tokens if t["token_type"] == "BPE")
special = sum(1 for t in tokens if t["token_type"] in _SPECIAL_TYPES)
tr = roots + suffixes + foreign + punct + special
pure = sum(
1 for t in tokens
if t["token_type"] in ("ROOT", "SUFFIX", "FOREIGN")
and not t["token"].strip().startswith("<")
)
return {
"total": total,
"roots": roots,
"suffixes": suffixes,
"foreign": foreign,
"bpe": bpe,
"punct": punct,
"special": special,
"tr_pct": round(tr / total * 100, 2),
"pure_pct": round(pure / total * 100, 2),
}
# ── Internal helpers ──────────────────────────────────────────────────────────
def _reclassify_domain_roots(tokens: list[dict], domain_lower: set) -> list[dict]:
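    """Fix 6: promote word-initial BPE tokens found in the domain vocabulary
    (medical / sports / tourism) to ROOT and mark them with ``_domain``."""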
result = []
for tok in tokens:
if tok["type"] != "BPE":
result.append(tok)
continue
raw = tok["token"]
        if raw == raw.lstrip():  # no leading space → not word-initial
result.append(tok)
continue
if raw.lstrip().lower() in domain_lower:
result.append({**tok, "type": "ROOT", "_domain": True})
else:
result.append(tok)
return result
def _add_output_fields(tokens: list[dict]) -> list[dict]:
"""Compute token_type and morph_pos and add them to every token."""
result = []
word_pos = 0
for tok in tokens:
raw = tok["token"]
base_type = tok["type"]
stripped = raw.strip()
# ── token_type: FOREIGN for foreign ROOTs ─────────────────────────
if base_type == "ROOT" and tok.get("_foreign"):
token_type = "FOREIGN"
else:
token_type = base_type
# ── morph_pos ─────────────────────────────────────────────────────
is_word_start = raw.startswith(" ") or stripped.startswith("<")
if is_word_start or base_type in _SPECIAL_TYPES or base_type == "PUNCT":
word_pos = 0
morph_pos = 0
elif base_type == "SUFFIX":
word_pos += 1
morph_pos = word_pos
else:
# ROOT or BPE within a word (no leading space)
word_pos = 0
morph_pos = 0
result.append({**tok, "token_type": token_type, "morph_pos": morph_pos})
return result