# NedoTurkishTokenizer — tokenization_nedo_turkish.py
# (Project renamed from TurkTokenizer to NedoTurkishTokenizer; hub commit cfffd93)
"""
NedoTurkishTokenizer β€” HuggingFace AutoTokenizer compatible class.
Usage:
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained("Ethosoft/NedoTurkishTokenizer", trust_remote_code=True)
out = tok("Δ°stanbul'da meeting'e katΔ±lamadΔ±m")
out["input_ids"] # hash-stable int IDs of morphological tokens
out["attention_mask"] # all 1s
out["token_type_ids"] # 0=root/foreign, 1=suffix, 2=bpe, 3=punct, 4=num/date/unit, 5=url/social
out["morphological_tokens"] # full morphological dicts (token, token_type, morph_pos, ...)
"""
from __future__ import annotations
import hashlib
from typing import Any
from transformers import PreTrainedTokenizer
# ── Morphological type β†’ token_type_id ───────────────────────────────────────
_MTYPE_ID = {
"ROOT": 0,
"FOREIGN": 0,
"SUFFIX": 1,
"BPE": 2,
"PUNCT": 3,
"NUM": 4,
"DATE": 4,
"UNIT": 4,
"URL": 5,
"MENTION": 5,
"HASHTAG": 5,
"EMOJI": 5,
}
def _stable_hash(s: str) -> int:
"""MD5-based stable hash that does NOT change between Python runs."""
return int(hashlib.md5(s.encode("utf-8")).hexdigest()[:6], 16)
class NedoTurkishTokenizer(PreTrainedTokenizer):
    """
    Turkish morphological tokenizer — HuggingFace compatible.

    ``input_ids`` are MD5-hash-based stable IDs (not lookup-table vocab IDs).
    For downstream transformer use, embed by ``token_type_ids`` or learn a
    projection from the ``morphological_tokens`` metadata.

    All standard HuggingFace fields are present:
        input_ids, attention_mask, token_type_ids
    Extra field:
        morphological_tokens — list[dict] with token, token_type, morph_pos, ...
    """

    # No vocab files to load/save: IDs come from hashing, not a lookup table.
    vocab_files_names: dict = {}
    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]

    def __init__(self, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        # Heavy morphological core is constructed lazily on first use.
        self._morph: "NedoTurkishTokenizer_core | None" = None

    def _get_morph(self):
        """Lazily instantiate and cache the morphological core engine."""
        if self._morph is None:
            from nedo_turkish_tokenizer import NedoTurkishTokenizer as _Core  # noqa: PLC0415
            self._morph = _Core()
        return self._morph

    # ── PreTrainedTokenizer required interface ────────────────────────────────
    @property
    def vocab_size(self) -> int:
        """Size of the hash ID space: 2**24 (IDs are 24-bit MD5 prefixes)."""
        return 16_777_216

    def get_vocab(self) -> dict:
        """Return an empty mapping — there is no fixed vocabulary."""
        return {}

    def _tokenize(self, text: str) -> list[str]:
        """Return token strings from the morphological pipeline."""
        tokens = self._get_morph().tokenize(text)
        return [t["token"] for t in tokens]

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token string to its stable 24-bit hash ID."""
        return _stable_hash(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Hash IDs are one-way: no inverse mapping without a vocab."""
        return ""

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: str | None = None,
    ) -> tuple:
        """Return an empty tuple — there is no vocab file to write."""
        return ()

    # ── Main __call__ override ────────────────────────────────────────────────
    def __call__(
        self,
        text: str | list[str],
        return_morphological_tokens: bool = True,
        **kwargs: Any,
    ) -> dict:
        """
        Tokenize text and return a dict with standard HuggingFace fields
        plus (optionally) ``morphological_tokens``.

        Args:
            text: Single string or list of strings.
            return_morphological_tokens: Include full morphological dicts.
            **kwargs: Accepted for HF call-site compatibility but ignored
                (no padding/truncation/return_tensors handling here).

        Returns:
            dict with:
                input_ids            : list[int]  or list[list[int]]
                attention_mask       : list[int]  or list[list[int]]
                token_type_ids       : list[int]  or list[list[int]]
                morphological_tokens : list[dict] or list[list[dict]]
                    (present only when return_morphological_tokens=True)
        """
        if isinstance(text, list):
            results = [self._encode_single(t, return_morphological_tokens) for t in text]
            batch: dict = {
                "input_ids": [r["input_ids"] for r in results],
                "attention_mask": [r["attention_mask"] for r in results],
                "token_type_ids": [r["token_type_ids"] for r in results],
            }
            # BUG FIX: the batch path previously indexed r["morphological_tokens"]
            # unconditionally, raising KeyError whenever
            # return_morphological_tokens=False (the key is only added by
            # _encode_single when with_morph is True).
            if return_morphological_tokens:
                batch["morphological_tokens"] = [r["morphological_tokens"] for r in results]
            return batch
        return self._encode_single(text, return_morphological_tokens)

    def _encode_single(self, text: str, with_morph: bool) -> dict:
        """Encode one string; include morphological dicts iff *with_morph*."""
        morph = self._get_morph()
        tokens = morph.tokenize(text)
        input_ids = [_stable_hash(t["token"]) for t in tokens]
        attn_mask = [1] * len(tokens)  # no padding, so every position is real
        # Unknown morphological types fall back to type id 0 (root/other).
        type_ids = [_MTYPE_ID.get(t["token_type"], 0) for t in tokens]
        out: dict = {
            "input_ids": input_ids,
            "attention_mask": attn_mask,
            "token_type_ids": type_ids,
        }
        if with_morph:
            out["morphological_tokens"] = tokens
        return out

    # ── Convenience helpers ───────────────────────────────────────────────────
    def encode(self, text: str, **kwargs) -> list[int]:  # type: ignore[override]
        """Return just the hash-based input_ids for *text*."""
        return self._encode_single(text, with_morph=False)["input_ids"]

    def decode(self, token_ids: list[int], **kwargs) -> str:  # type: ignore[override]
        """Not meaningful without a fixed vocab — returns empty string."""
        return ""

    def tokenize(self, text: str, **kwargs) -> list[str]:
        """Return token strings (thin wrapper over _tokenize)."""
        return self._tokenize(text)

    def morphological_tokenize(self, text: str) -> list[dict]:
        """Return full morphological token dicts (main NedoTurkishTokenizer output)."""
        return self._get_morph().tokenize(text)

    def batch_tokenize(self, texts: list[str], workers: int | None = None) -> list[list[dict]]:
        """Parallel morphological tokenization via the core engine."""
        return self._get_morph().batch_tokenize(texts, workers=workers)

    def stats(self, tokens: list[dict]) -> dict:
        """Compute TR% and other morphological coverage stats."""
        return self._get_morph().stats(tokens)