| """ |
NedoTurkishTokenizer — HuggingFace AutoTokenizer compatible class.
| |
| Usage: |
| from transformers import AutoTokenizer |
| |
| tok = AutoTokenizer.from_pretrained("Ethosoft/NedoTurkishTokenizer", trust_remote_code=True) |
| out = tok("Δ°stanbul'da meeting'e katΔ±lamadΔ±m") |
| |
| out["input_ids"] # hash-stable int IDs of morphological tokens |
| out["attention_mask"] # all 1s |
| out["token_type_ids"] # 0=root/other, 1=suffix |
| out["morphological_tokens"] # full morphological dicts (token, token_type, morph_pos, ...) |
| """ |
|
|
| from __future__ import annotations |
|
|
| import hashlib |
| from typing import Any |
|
|
| from transformers import PreTrainedTokenizer |
|
|
| |
# Map each morphological token type (the core tokenizer's "token_type" field)
# onto a coarse integer class used for token_type_ids.
# Grouping: 0 = word roots (incl. foreign words), 1 = suffixes,
# 2 = BPE fallback pieces, 3 = punctuation, 4 = numeric-like
# (numbers / dates / units), 5 = web entities (URLs, mentions, hashtags, emoji).
# Types not listed here fall back to 0 at lookup time.
_MTYPE_ID = {
    "ROOT": 0,
    "FOREIGN": 0,
    "SUFFIX": 1,
    "BPE": 2,
    "PUNCT": 3,
    "NUM": 4,
    "DATE": 4,
    "UNIT": 4,
    "URL": 5,
    "MENTION": 5,
    "HASHTAG": 5,
    "EMOJI": 5,
}
|
|
|
|
| def _stable_hash(s: str) -> int: |
| """MD5-based stable hash that does NOT change between Python runs.""" |
| return int(hashlib.md5(s.encode("utf-8")).hexdigest()[:6], 16) |
|
|
|
|
class NedoTurkishTokenizer(PreTrainedTokenizer):
    """
    Turkish morphological tokenizer — HuggingFace compatible.

    ``input_ids`` are MD5-hash-based stable IDs (not lookup-table vocab IDs).
    For downstream transformer use, embed by ``token_type_ids`` or learn a
    projection from the ``morphological_tokens`` metadata.

    All standard HuggingFace fields are present:
        input_ids, attention_mask, token_type_ids

    Extra field (when requested):
        morphological_tokens — list[dict] with token, token_type, morph_pos, ...
    """

    # No vocab files on disk: IDs are derived by hashing, not loaded.
    vocab_files_names: dict = {}
    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]

    def __init__(self, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        # Core morphological engine; created lazily on first use so that
        # constructing this wrapper stays cheap and import-safe.
        self._morph: Any = None

    def _get_morph(self):
        """Lazily instantiate and cache the core morphological tokenizer."""
        if self._morph is None:
            from nedo_turkish_tokenizer import NedoTurkishTokenizer as _Core
            self._morph = _Core()
        return self._morph

    @property
    def vocab_size(self) -> int:
        # 16 ** 6 == 16_777_216: the ID space of the 6-hex-digit MD5
        # prefix used by ``_stable_hash``.
        return 16_777_216

    def get_vocab(self) -> dict:
        """No materialized vocab exists — IDs are computed by hashing."""
        return {}

    def _tokenize(self, text: str) -> list[str]:
        """Return token strings from the morphological pipeline."""
        tokens = self._get_morph().tokenize(text)
        return [t["token"] for t in tokens]

    def _convert_token_to_id(self, token: str) -> int:
        return _stable_hash(token)

    def _convert_id_to_token(self, index: int) -> str:
        # Hash IDs are not invertible; there is no token string to recover.
        return ""

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: str | None = None,
    ) -> tuple:
        # Nothing to persist: this tokenizer carries no vocabulary files.
        return ()

    def __call__(
        self,
        text: str | list[str],
        return_morphological_tokens: bool = True,
        **kwargs: Any,
    ) -> dict:
        """
        Tokenize text and return a dict with standard HuggingFace fields,
        plus ``morphological_tokens`` when requested.

        Args:
            text: Single string or list of strings.
            return_morphological_tokens: Include full morphological dicts.

        Returns:
            dict with:
                input_ids            : list[int] or list[list[int]]
                attention_mask       : list[int] or list[list[int]]
                token_type_ids       : list[int] or list[list[int]]
                morphological_tokens : list[dict] or list[list[dict]]
                                       (only when return_morphological_tokens)
        """
        if isinstance(text, list):
            results = [self._encode_single(t, return_morphological_tokens) for t in text]
            batch: dict = {
                "input_ids": [r["input_ids"] for r in results],
                "attention_mask": [r["attention_mask"] for r in results],
                "token_type_ids": [r["token_type_ids"] for r in results],
            }
            # BUGFIX: only collect morphological_tokens when they were
            # produced — previously this key was read unconditionally and
            # raised KeyError for return_morphological_tokens=False.
            if return_morphological_tokens:
                batch["morphological_tokens"] = [r["morphological_tokens"] for r in results]
            return batch
        return self._encode_single(text, return_morphological_tokens)

    def _encode_single(self, text: str, with_morph: bool) -> dict:
        """Encode one string; optionally attach the full morphological dicts."""
        tokens = self._get_morph().tokenize(text)

        input_ids = [_stable_hash(t["token"]) for t in tokens]
        attn_mask = [1] * len(tokens)
        # Unknown token types fall back to class 0 (root/other).
        type_ids = [_MTYPE_ID.get(t["token_type"], 0) for t in tokens]

        out: dict = {
            "input_ids": input_ids,
            "attention_mask": attn_mask,
            "token_type_ids": type_ids,
        }
        if with_morph:
            out["morphological_tokens"] = tokens
        return out

    def encode(self, text: str, **kwargs) -> list[int]:
        """Return only the hash-stable input IDs for *text*."""
        return self._encode_single(text, with_morph=False)["input_ids"]

    def decode(self, token_ids: list[int], **kwargs) -> str:
        """Not meaningful without a fixed vocab — returns empty string."""
        return ""

    def tokenize(self, text: str, **kwargs) -> list[str]:
        """Return plain token strings (see ``_tokenize``)."""
        return self._tokenize(text)

    def morphological_tokenize(self, text: str) -> list[dict]:
        """Return full morphological token dicts (main NedoTurkishTokenizer output)."""
        return self._get_morph().tokenize(text)

    def batch_tokenize(self, texts: list[str], workers: int | None = None) -> list[list[dict]]:
        """Parallel morphological tokenization, delegated to the core engine."""
        return self._get_morph().batch_tokenize(texts, workers=workers)

    def stats(self, tokens: list[dict]) -> dict:
        """Compute TR% and other morphological coverage stats."""
        return self._get_morph().stats(tokens)
|