# NedoTurkishTokenizer — tokenization_nedo_turkish.py
# (Project renamed from TurkTokenizer to NedoTurkishTokenizer; hub commit cfffd93)
"""
NedoTurkishTokenizer β€” HuggingFace AutoTokenizer compatible class.
Usage:
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained("Ethosoft/NedoTurkishTokenizer", trust_remote_code=True)
out = tok("Δ°stanbul'da meeting'e katΔ±lamadΔ±m")
out["input_ids"] # hash-stable int IDs of morphological tokens
out["attention_mask"] # all 1s
out["token_type_ids"] # 0=root/foreign, 1=suffix, 2=bpe, 3=punct, 4=num/date/unit, 5=url/social
out["morphological_tokens"] # full morphological dicts (token, token_type, morph_pos, ...)
"""
from __future__ import annotations
import hashlib
from typing import Any
from transformers import PreTrainedTokenizer
# ── Morphological type β†’ token_type_id ───────────────────────────────────────
_MTYPE_ID = {
"ROOT": 0,
"FOREIGN": 0,
"SUFFIX": 1,
"BPE": 2,
"PUNCT": 3,
"NUM": 4,
"DATE": 4,
"UNIT": 4,
"URL": 5,
"MENTION": 5,
"HASHTAG": 5,
"EMOJI": 5,
}
def _stable_hash(s: str) -> int:
"""MD5-based stable hash that does NOT change between Python runs."""
return int(hashlib.md5(s.encode("utf-8")).hexdigest()[:6], 16)
class NedoTurkishTokenizer(PreTrainedTokenizer):
    """
    Turkish morphological tokenizer — HuggingFace compatible.

    ``input_ids`` are MD5-hash-based stable IDs (not lookup-table vocab IDs).
    For downstream transformer use, embed by ``token_type_ids`` or learn a
    projection from the ``morphological_tokens`` metadata.

    All standard HuggingFace fields are present:
        input_ids, attention_mask, token_type_ids
    Extra field:
        morphological_tokens — list[dict] with token, token_type, morph_pos, ...
    """

    # No vocab files to load/save: IDs come from hashing, not a lookup table.
    vocab_files_names: dict = {}
    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]

    def __init__(self, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        # Heavy morphological core is constructed lazily on first use.
        self._morph: "NedoTurkishTokenizer_core | None" = None

    def _get_morph(self):
        """Lazily instantiate and cache the morphological core engine."""
        if self._morph is None:
            from nedo_turkish_tokenizer import NedoTurkishTokenizer as _Core  # noqa: PLC0415
            self._morph = _Core()
        return self._morph

    # ── PreTrainedTokenizer required interface ────────────────────────────────
    @property
    def vocab_size(self) -> int:
        """Size of the hash ID space: 2**24 (IDs are 24-bit MD5 prefixes)."""
        return 16_777_216

    def get_vocab(self) -> dict:
        """Return an empty mapping — there is no fixed vocabulary."""
        return {}

    def _tokenize(self, text: str) -> list[str]:
        """Return token strings from the morphological pipeline."""
        tokens = self._get_morph().tokenize(text)
        return [t["token"] for t in tokens]

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token string to its stable 24-bit hash ID."""
        return _stable_hash(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Hash IDs are one-way: no inverse mapping without a vocab."""
        return ""

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: str | None = None,
    ) -> tuple:
        """Return an empty tuple — there is no vocab file to write."""
        return ()

    # ── Main __call__ override ────────────────────────────────────────────────
    def __call__(
        self,
        text: str | list[str],
        return_morphological_tokens: bool = True,
        **kwargs: Any,
    ) -> dict:
        """
        Tokenize text and return a dict with standard HuggingFace fields
        plus (optionally) ``morphological_tokens``.

        Args:
            text: Single string or list of strings.
            return_morphological_tokens: Include full morphological dicts.
            **kwargs: Accepted for HF call-site compatibility but ignored
                (no padding/truncation/return_tensors handling here).

        Returns:
            dict with:
                input_ids            : list[int]  or list[list[int]]
                attention_mask       : list[int]  or list[list[int]]
                token_type_ids       : list[int]  or list[list[int]]
                morphological_tokens : list[dict] or list[list[dict]]
                    (present only when return_morphological_tokens=True)
        """
        if isinstance(text, list):
            results = [self._encode_single(t, return_morphological_tokens) for t in text]
            batch: dict = {
                "input_ids": [r["input_ids"] for r in results],
                "attention_mask": [r["attention_mask"] for r in results],
                "token_type_ids": [r["token_type_ids"] for r in results],
            }
            # BUG FIX: the batch path previously indexed r["morphological_tokens"]
            # unconditionally, raising KeyError whenever
            # return_morphological_tokens=False (the key is only added by
            # _encode_single when with_morph is True).
            if return_morphological_tokens:
                batch["morphological_tokens"] = [r["morphological_tokens"] for r in results]
            return batch
        return self._encode_single(text, return_morphological_tokens)

    def _encode_single(self, text: str, with_morph: bool) -> dict:
        """Encode one string; include morphological dicts iff *with_morph*."""
        morph = self._get_morph()
        tokens = morph.tokenize(text)
        input_ids = [_stable_hash(t["token"]) for t in tokens]
        attn_mask = [1] * len(tokens)  # no padding, so every position is real
        # Unknown morphological types fall back to type id 0 (root/other).
        type_ids = [_MTYPE_ID.get(t["token_type"], 0) for t in tokens]
        out: dict = {
            "input_ids": input_ids,
            "attention_mask": attn_mask,
            "token_type_ids": type_ids,
        }
        if with_morph:
            out["morphological_tokens"] = tokens
        return out

    # ── Convenience helpers ───────────────────────────────────────────────────
    def encode(self, text: str, **kwargs) -> list[int]:  # type: ignore[override]
        """Return just the hash-based input_ids for *text*."""
        return self._encode_single(text, with_morph=False)["input_ids"]

    def decode(self, token_ids: list[int], **kwargs) -> str:  # type: ignore[override]
        """Not meaningful without a fixed vocab — returns empty string."""
        return ""

    def tokenize(self, text: str, **kwargs) -> list[str]:
        """Return token strings (thin wrapper over _tokenize)."""
        return self._tokenize(text)

    def morphological_tokenize(self, text: str) -> list[dict]:
        """Return full morphological token dicts (main NedoTurkishTokenizer output)."""
        return self._get_morph().tokenize(text)

    def batch_tokenize(self, texts: list[str], workers: int | None = None) -> list[list[dict]]:
        """Parallel morphological tokenization via the core engine."""
        return self._get_morph().batch_tokenize(texts, workers=workers)

    def stats(self, tokens: list[dict]) -> dict:
        """Compute TR% and other morphological coverage stats."""
        return self._get_morph().stats(tokens)