# File size: 6,453 Bytes

"""
NedoTurkishTokenizer — HuggingFace AutoTokenizer compatible class.

Usage:
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("Ethosoft/NedoTurkishTokenizer", trust_remote_code=True)
    out = tok("İstanbul'da meeting'e katılamadım")

    out["input_ids"]            # hash-stable int IDs of morphological tokens
    out["attention_mask"]       # all 1s
    out["token_type_ids"]       # 0=root/foreign/unknown, 1=suffix, 2=BPE, 3=punct, 4=num/date/unit, 5=URL/mention/hashtag/emoji
    out["morphological_tokens"] # full morphological dicts (token, token_type, morph_pos, ...)
"""

from __future__ import annotations

import hashlib
from typing import Any

from transformers import PreTrainedTokenizer

# ── Morphological type → token_type_id ───────────────────────────────────────
_MTYPE_ID = {
    "ROOT":    0,
    "FOREIGN": 0,
    "SUFFIX":  1,
    "BPE":     2,
    "PUNCT":   3,
    "NUM":     4,
    "DATE":    4,
    "UNIT":    4,
    "URL":     5,
    "MENTION": 5,
    "HASHTAG": 5,
    "EMOJI":   5,
}


def _stable_hash(s: str) -> int:
    """MD5-based stable hash that does NOT change between Python runs."""
    return int(hashlib.md5(s.encode("utf-8")).hexdigest()[:6], 16)


class NedoTurkishTokenizer(PreTrainedTokenizer):
    """
    Turkish morphological tokenizer — HuggingFace compatible.

    ``input_ids`` are MD5-hash-based stable IDs (not lookup-table vocab IDs).
    For downstream transformer use, embed by ``token_type_ids`` or learn a
    projection from the ``morphological_tokens`` metadata.

    All standard HuggingFace fields are present:
        input_ids, attention_mask, token_type_ids

    Extra field:
        morphological_tokens — list[dict] with token, token_type, morph_pos, ...

    ``token_type_ids`` are looked up in ``_MTYPE_ID`` from each token's
    ``token_type``; unknown types fall back to 0.
    """

    vocab_files_names: dict = {}
    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]

    def __init__(self, **kwargs: Any) -> None:
        """Create the wrapper; the core tokenizer itself is built on first use."""
        # Assigned before the base initialiser runs so the attribute already
        # exists if the base class calls an overridden method during set-up.
        self._morph: Any = None  # core tokenizer instance, created lazily
        super().__init__(**kwargs)

    def _get_morph(self):
        """Return the core morphological tokenizer, importing it on first call."""
        if self._morph is None:
            # Deferred import: the core package is only needed once text is
            # actually tokenized.
            from nedo_turkish_tokenizer import NedoTurkishTokenizer as _Core  # noqa: PLC0415
            self._morph = _Core()
        return self._morph

    # ── PreTrainedTokenizer required interface ────────────────────────────────

    @property
    def vocab_size(self) -> int:
        """Size of the ID space: ``_stable_hash`` yields 24-bit values."""
        return 16_777_216   # 2^24 — MD5 hash space

    def get_vocab(self) -> dict:
        """Return an empty mapping; IDs are hashed, not looked up."""
        return {}           # no fixed vocabulary

    def _tokenize(self, text: str) -> list[str]:
        """Return token strings from the morphological pipeline."""
        tokens = self._get_morph().tokenize(text)
        return [t["token"] for t in tokens]

    def _convert_token_to_id(self, token: str) -> int:
        """Return the stable hash ID of *token*."""
        return _stable_hash(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Return an empty string; hashed IDs cannot be mapped back."""
        return ""           # no inverse mapping without a vocab

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: str | None = None,
    ) -> tuple:
        """Write nothing and return an empty tuple (there is no vocab file)."""
        return ()           # no vocab file

    # ── Main __call__ override ────────────────────────────────────────────────

    def __call__(
        self,
        text: str | list[str],
        return_morphological_tokens: bool = True,
        **kwargs: Any,
    ) -> dict:
        """
        Tokenize text and return a dict with standard HuggingFace fields
        plus, optionally, ``morphological_tokens``.

        Args:
            text: Single string or list of strings.
            return_morphological_tokens: Include full morphological dicts.
            **kwargs: Accepted for call compatibility; not used here.

        Returns:
            dict with:
                input_ids            : list[int] or list[list[int]]
                attention_mask       : list[int] or list[list[int]]
                token_type_ids       : list[int] or list[list[int]]
                morphological_tokens : list[dict] or list[list[dict]]
                                       (only when return_morphological_tokens
                                       is true)
        """
        if not isinstance(text, list):
            return self._encode_single(text, return_morphological_tokens)

        results = [self._encode_single(t, return_morphological_tokens) for t in text]
        batch: dict = {
            "input_ids":      [r["input_ids"] for r in results],
            "attention_mask": [r["attention_mask"] for r in results],
            "token_type_ids": [r["token_type_ids"] for r in results],
        }
        # _encode_single omits this key when morphological output is disabled,
        # so it must only be gathered when it was requested; indexing it
        # unconditionally raises KeyError.
        if return_morphological_tokens:
            batch["morphological_tokens"] = [r["morphological_tokens"] for r in results]
        return batch

    def _encode_single(self, text: str, with_morph: bool) -> dict:
        """Encode one string.

        Args:
            text: The string to tokenize.
            with_morph: When true, add the raw token dicts from the core
                tokenizer under ``"morphological_tokens"``.

        Returns:
            dict with ``input_ids`` (hash of each token string),
            ``attention_mask`` (all 1s) and ``token_type_ids`` (from
            ``_MTYPE_ID``, defaulting to 0), each one entry per token.
        """
        morph = self._get_morph()
        tokens = morph.tokenize(text)

        input_ids   = [_stable_hash(t["token"]) for t in tokens]
        attn_mask   = [1] * len(tokens)
        type_ids    = [_MTYPE_ID.get(t["token_type"], 0) for t in tokens]

        out: dict = {
            "input_ids":      input_ids,
            "attention_mask": attn_mask,
            "token_type_ids": type_ids,
        }
        if with_morph:
            out["morphological_tokens"] = tokens
        return out

    # ── Convenience helpers ───────────────────────────────────────────────────

    def encode(self, text: str, **kwargs) -> list[int]:  # type: ignore[override]
        """Return only the hash-based ``input_ids`` for *text*."""
        return self._encode_single(text, with_morph=False)["input_ids"]

    def decode(self, token_ids: list[int], **kwargs) -> str:  # type: ignore[override]
        """Not meaningful without a fixed vocab — returns empty string."""
        return ""

    def tokenize(self, text: str, **kwargs) -> list[str]:
        """Return the token strings for *text*; extra keyword arguments are ignored."""
        return self._tokenize(text)

    def morphological_tokenize(self, text: str) -> list[dict]:
        """Return full morphological token dicts (main NedoTurkishTokenizer output)."""
        return self._get_morph().tokenize(text)

    def batch_tokenize(self, texts: list[str], workers: int | None = None) -> list[list[dict]]:
        """Tokenize many texts by delegating to the core ``batch_tokenize``.

        ``workers`` is forwarded unchanged; the original docstring describes
        this as parallel tokenization — the behaviour is defined by the core
        package, not here.
        """
        return self._get_morph().batch_tokenize(texts, workers=workers)

    def stats(self, tokens: list[dict]) -> dict:
        """Compute TR% and other morphological coverage stats (delegates to the core)."""
        return self._get_morph().stats(tokens)