""" NedoTurkishTokenizer — HuggingFace AutoTokenizer compatible class. Usage: from transformers import AutoTokenizer tok = AutoTokenizer.from_pretrained("Ethosoft/NedoTurkishTokenizer", trust_remote_code=True) out = tok("İstanbul'da meeting'e katılamadım") out["input_ids"] # hash-stable int IDs of morphological tokens out["attention_mask"] # all 1s out["token_type_ids"] # 0=root/other, 1=suffix out["morphological_tokens"] # full morphological dicts (token, token_type, morph_pos, ...) """ from __future__ import annotations import hashlib from typing import Any from transformers import PreTrainedTokenizer # ── Morphological type → token_type_id ─────────────────────────────────────── _MTYPE_ID = { "ROOT": 0, "FOREIGN": 0, "SUFFIX": 1, "BPE": 2, "PUNCT": 3, "NUM": 4, "DATE": 4, "UNIT": 4, "URL": 5, "MENTION": 5, "HASHTAG": 5, "EMOJI": 5, } def _stable_hash(s: str) -> int: """MD5-based stable hash that does NOT change between Python runs.""" return int(hashlib.md5(s.encode("utf-8")).hexdigest()[:6], 16) class NedoTurkishTokenizer(PreTrainedTokenizer): """ Turkish morphological tokenizer — HuggingFace compatible. ``input_ids`` are MD5-hash-based stable IDs (not lookup-table vocab IDs). For downstream transformer use, embed by ``token_type_ids`` or learn a projection from the ``morphological_tokens`` metadata. All standard HuggingFace fields are present: input_ids, attention_mask, token_type_ids Extra field: morphological_tokens — list[dict] with token, token_type, morph_pos, ... """ vocab_files_names: dict = {} model_input_names = ["input_ids", "attention_mask", "token_type_ids"] def __init__(self, **kwargs: Any) -> None: super().__init__(**kwargs) self._morph: "NedoTurkishTokenizer_core | None" = None # lazy init def _get_morph(self): if self._morph is None: from nedo_turkish_tokenizer import NedoTurkishTokenizer as _Core # noqa: PLC0415 self._morph = _Core() return self._morph # ── PreTrainedTokenizer required interface ──────────────────────────────── @property def vocab_size(self) -> int: return 16_777_216 # 2^24 — MD5 hash space def get_vocab(self) -> dict: return {} # no fixed vocabulary def _tokenize(self, text: str) -> list[str]: """Return token strings from the morphological pipeline.""" tokens = self._get_morph().tokenize(text) return [t["token"] for t in tokens] def _convert_token_to_id(self, token: str) -> int: return _stable_hash(token) def _convert_id_to_token(self, index: int) -> str: return "" # no inverse mapping without a vocab def save_vocabulary( self, save_directory: str, filename_prefix: str | None = None, ) -> tuple: return () # no vocab file # ── Main __call__ override ──────────────────────────────────────────────── def __call__( self, text: str | list[str], return_morphological_tokens: bool = True, **kwargs: Any, ) -> dict: """ Tokenize text and return a dict with standard HuggingFace fields plus ``morphological_tokens``. Args: text: Single string or list of strings. return_morphological_tokens: Include full morphological dicts. Returns: dict with: input_ids : list[int] or list[list[int]] attention_mask : list[int] or list[list[int]] token_type_ids : list[int] or list[list[int]] morphological_tokens : list[dict] or list[list[dict]] """ if isinstance(text, list): results = [self._encode_single(t, return_morphological_tokens) for t in text] return { "input_ids": [r["input_ids"] for r in results], "attention_mask": [r["attention_mask"] for r in results], "token_type_ids": [r["token_type_ids"] for r in results], "morphological_tokens": [r["morphological_tokens"] for r in results], } return self._encode_single(text, return_morphological_tokens) def _encode_single(self, text: str, with_morph: bool) -> dict: morph = self._get_morph() tokens = morph.tokenize(text) input_ids = [_stable_hash(t["token"]) for t in tokens] attn_mask = [1] * len(tokens) type_ids = [_MTYPE_ID.get(t["token_type"], 0) for t in tokens] out: dict = { "input_ids": input_ids, "attention_mask": attn_mask, "token_type_ids": type_ids, } if with_morph: out["morphological_tokens"] = tokens return out # ── Convenience helpers ─────────────────────────────────────────────────── def encode(self, text: str, **kwargs) -> list[int]: # type: ignore[override] return self._encode_single(text, with_morph=False)["input_ids"] def decode(self, token_ids: list[int], **kwargs) -> str: # type: ignore[override] """Not meaningful without a fixed vocab — returns empty string.""" return "" def tokenize(self, text: str, **kwargs) -> list[str]: return self._tokenize(text) def morphological_tokenize(self, text: str) -> list[dict]: """Return full morphological token dicts (main NedoTurkishTokenizer output).""" return self._get_morph().tokenize(text) def batch_tokenize(self, texts: list[str], workers: int | None = None) -> list[list[dict]]: """Parallel morphological tokenization.""" return self._get_morph().batch_tokenize(texts, workers=workers) def stats(self, tokens: list[dict]) -> dict: """Compute TR% and other morphological coverage stats.""" return self._get_morph().stats(tokens)