nmstech committed on
Commit
fffa764
Β·
verified Β·
1 Parent(s): a0e8f24

Add AutoTokenizer support (trust_remote_code)

Browse files
Files changed (1) hide show
  1. tokenization_turk.py +172 -0
tokenization_turk.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ TurkTokenizer β€” HuggingFace AutoTokenizer compatible class.
3
+
4
+ Usage:
5
+ from transformers import AutoTokenizer
6
+
7
+ tok = AutoTokenizer.from_pretrained("Ethosoft/turk-tokenizer", trust_remote_code=True)
8
+ out = tok("Δ°stanbul'da meeting'e katΔ±lamadΔ±m")
9
+
10
+ out["input_ids"] # hash-stable int IDs of morphological tokens
11
+ out["attention_mask"] # all 1s
12
+ out["token_type_ids"] # 0=root/other, 1=suffix
13
+ out["morphological_tokens"] # full morphological dicts (token, token_type, morph_pos, ...)
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import hashlib
19
+ from typing import Any
20
+
21
+ from transformers import PreTrainedTokenizer
22
+
23
+ # ── Morphological type β†’ token_type_id ───────────────────────────────────────
24
+ _MTYPE_ID = {
25
+ "ROOT": 0,
26
+ "FOREIGN": 0,
27
+ "SUFFIX": 1,
28
+ "BPE": 2,
29
+ "PUNCT": 3,
30
+ "NUM": 4,
31
+ "DATE": 4,
32
+ "UNIT": 4,
33
+ "URL": 5,
34
+ "MENTION": 5,
35
+ "HASHTAG": 5,
36
+ "EMOJI": 5,
37
+ }
38
+
39
+
40
+ def _stable_hash(s: str) -> int:
41
+ """MD5-based stable hash that does NOT change between Python runs."""
42
+ return int(hashlib.md5(s.encode("utf-8")).hexdigest()[:6], 16)
43
+
44
+
45
class TurkTokenizer(PreTrainedTokenizer):
    """
    Turkish morphological tokenizer — HuggingFace compatible.

    ``input_ids`` are MD5-hash-based stable IDs (not lookup-table vocab IDs).
    For downstream transformer use, embed by ``token_type_ids`` or learn a
    projection from the ``morphological_tokens`` metadata.

    All standard HuggingFace fields are present:
        input_ids, attention_mask, token_type_ids

    Extra field:
        morphological_tokens — list[dict] with token, token_type, morph_pos, ...
    """

    # No vocab files: IDs come from a stable hash, not a lookup table.
    vocab_files_names: dict = {}
    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]

    def __init__(self, **kwargs: Any) -> None:
        # The lazy-init slot must exist BEFORE super().__init__() runs:
        # PreTrainedTokenizer.__init__ may tokenize added/special tokens,
        # which calls our _tokenize → _get_morph and would otherwise raise
        # AttributeError on a not-yet-assigned ``_morph``.
        self._morph: "TurkTokenizer_core | None" = None  # lazy init
        super().__init__(**kwargs)

    def _get_morph(self):
        """Lazily import and cache the core morphological tokenizer."""
        if self._morph is None:
            # Imported here so the HF wrapper can be loaded without the core
            # package being needed at import time.
            from turk_tokenizer import TurkTokenizer as _Core  # noqa: PLC0415
            self._morph = _Core()
        return self._morph

    # ── PreTrainedTokenizer required interface ────────────────────────────────

    @property
    def vocab_size(self) -> int:
        # Matches the 6-hex-digit (24-bit) space used by _stable_hash.
        return 16_777_216  # 2^24 — MD5 hash space

    def get_vocab(self) -> dict:
        return {}  # no fixed vocabulary

    def _tokenize(self, text: str) -> list[str]:
        """Return token strings from the morphological pipeline."""
        tokens = self._get_morph().tokenize(text)
        return [t["token"] for t in tokens]

    def _convert_token_to_id(self, token: str) -> int:
        """Hash-based stable ID; no vocab lookup involved."""
        return _stable_hash(token)

    def _convert_id_to_token(self, index: int) -> str:
        return ""  # no inverse mapping without a vocab

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: str | None = None,
    ) -> tuple:
        """No vocab file to write — return an empty tuple of paths."""
        return ()  # no vocab file

    # ── Main __call__ override ────────────────────────────────────────────────

    def __call__(
        self,
        text: str | list[str],
        return_morphological_tokens: bool = True,
        **kwargs: Any,
    ) -> dict:
        """
        Tokenize text and return a dict with standard HuggingFace fields
        plus ``morphological_tokens``.

        Args:
            text: Single string or list of strings.
            return_morphological_tokens: Include full morphological dicts.

        Returns:
            dict with:
                input_ids            : list[int] or list[list[int]]
                attention_mask       : list[int] or list[list[int]]
                token_type_ids       : list[int] or list[list[int]]
                morphological_tokens : list[dict] or list[list[dict]]

        Note:
            Extra HF kwargs (padding, truncation, return_tensors, ...) are
            accepted but currently ignored by this override.
        """
        if isinstance(text, list):
            # Batch path: encode each text, then transpose into columnar lists.
            results = [self._encode_single(t, return_morphological_tokens) for t in text]
            return {
                "input_ids": [r["input_ids"] for r in results],
                "attention_mask": [r["attention_mask"] for r in results],
                "token_type_ids": [r["token_type_ids"] for r in results],
                "morphological_tokens": [r["morphological_tokens"] for r in results],
            }
        return self._encode_single(text, return_morphological_tokens)

    def _encode_single(self, text: str, with_morph: bool) -> dict:
        """Encode one string into the HF-style output dict."""
        morph = self._get_morph()
        tokens = morph.tokenize(text)

        input_ids = [_stable_hash(t["token"]) for t in tokens]
        attn_mask = [1] * len(tokens)  # no padding → every position is real
        # Unknown morphological types fall back to 0 (same bucket as ROOT).
        type_ids = [_MTYPE_ID.get(t["token_type"], 0) for t in tokens]

        out: dict = {
            "input_ids": input_ids,
            "attention_mask": attn_mask,
            "token_type_ids": type_ids,
        }
        if with_morph:
            out["morphological_tokens"] = tokens
        return out

    # ── Convenience helpers ───────────────────────────────────────────────────

    def encode(self, text: str, **kwargs) -> list[int]:  # type: ignore[override]
        """Return only the stable input_ids for *text*."""
        return self._encode_single(text, with_morph=False)["input_ids"]

    def decode(self, token_ids: list[int], **kwargs) -> str:  # type: ignore[override]
        """Not meaningful without a fixed vocab — returns empty string."""
        return ""

    def tokenize(self, text: str, **kwargs) -> list[str]:
        """Return the plain token strings (no morphological metadata)."""
        return self._tokenize(text)

    def morphological_tokenize(self, text: str) -> list[dict]:
        """Return full morphological token dicts (main TurkTokenizer output)."""
        return self._get_morph().tokenize(text)

    def batch_tokenize(self, texts: list[str], workers: int | None = None) -> list[list[dict]]:
        """Parallel morphological tokenization."""
        return self._get_morph().batch_tokenize(texts, workers=workers)

    def stats(self, tokens: list[dict]) -> dict:
        """Compute TR% and other morphological coverage stats."""
        return self._get_morph().stats(tokens)