"""
NedoTurkishTokenizer — HuggingFace AutoTokenizer compatible class.
Usage:
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained("Ethosoft/NedoTurkishTokenizer", trust_remote_code=True)
out = tok("İstanbul'da meeting'e katılamadım")
out["input_ids"] # hash-stable int IDs of morphological tokens
out["attention_mask"] # all 1s
out["token_type_ids"] # 0=root/other, 1=suffix
out["morphological_tokens"] # full morphological dicts (token, token_type, morph_pos, ...)
"""
from __future__ import annotations
import hashlib
from typing import Any
from transformers import PreTrainedTokenizer
# ββ Morphological type β token_type_id βββββββββββββββββββββββββββββββββββββββ
_MTYPE_ID = {
"ROOT": 0,
"FOREIGN": 0,
"SUFFIX": 1,
"BPE": 2,
"PUNCT": 3,
"NUM": 4,
"DATE": 4,
"UNIT": 4,
"URL": 5,
"MENTION": 5,
"HASHTAG": 5,
"EMOJI": 5,
}
def _stable_hash(s: str) -> int:
"""MD5-based stable hash that does NOT change between Python runs."""
return int(hashlib.md5(s.encode("utf-8")).hexdigest()[:6], 16)
class NedoTurkishTokenizer(PreTrainedTokenizer):
    """
    Turkish morphological tokenizer — HuggingFace compatible.

    ``input_ids`` are MD5-hash-based stable IDs (not lookup-table vocab IDs).
    For downstream transformer use, embed by ``token_type_ids`` or learn a
    projection from the ``morphological_tokens`` metadata.

    All standard HuggingFace fields are present:
        input_ids, attention_mask, token_type_ids
    Extra field:
        morphological_tokens — list[dict] with token, token_type, morph_pos, ...
    """

    vocab_files_names: dict = {}  # no vocab files to download or save
    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]

    def __init__(self, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        # Core morphological pipeline; created lazily on first use so that
        # constructing the HF wrapper itself stays cheap.
        self._morph: "NedoTurkishTokenizer_core | None" = None  # lazy init

    def _get_morph(self):
        """Return the lazily-initialized core morphological tokenizer."""
        if self._morph is None:
            # Deferred import keeps the heavy core dependency off the
            # module-import path.
            from nedo_turkish_tokenizer import NedoTurkishTokenizer as _Core  # noqa: PLC0415
            self._morph = _Core()
        return self._morph

    # ── PreTrainedTokenizer required interface ────────────────────────────
    @property
    def vocab_size(self) -> int:
        """Size of the ID space: 2^24 (first 6 hex digits of an MD5 hash)."""
        return 16_777_216  # 2^24 — MD5 hash space

    def get_vocab(self) -> dict:
        """No fixed vocabulary exists; IDs are computed by hashing."""
        return {}  # no fixed vocabulary

    def _tokenize(self, text: str) -> list[str]:
        """Return token strings from the morphological pipeline."""
        tokens = self._get_morph().tokenize(text)
        return [t["token"] for t in tokens]

    def _convert_token_to_id(self, token: str) -> int:
        """Hash the token string to a run-stable 24-bit ID."""
        return _stable_hash(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Hashing is one-way; IDs cannot be mapped back to tokens."""
        return ""  # no inverse mapping without a vocab

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: str | None = None,
    ) -> tuple:
        """Nothing to persist — there is no vocabulary file."""
        return ()  # no vocab file

    # ── Main __call__ override ────────────────────────────────────────────
    def __call__(
        self,
        text: str | list[str],
        return_morphological_tokens: bool = True,
        **kwargs: Any,
    ) -> dict:
        """
        Tokenize text and return a dict with standard HuggingFace fields
        plus ``morphological_tokens`` (when requested).

        Args:
            text: Single string or list of strings.
            return_morphological_tokens: Include full morphological dicts.
        Returns:
            dict with:
                input_ids            : list[int] or list[list[int]]
                attention_mask       : list[int] or list[list[int]]
                token_type_ids       : list[int] or list[list[int]]
                morphological_tokens : list[dict] or list[list[dict]]
                                       (only if return_morphological_tokens)
        """
        if isinstance(text, list):
            results = [self._encode_single(t, return_morphological_tokens) for t in text]
            batch: dict = {
                "input_ids": [r["input_ids"] for r in results],
                "attention_mask": [r["attention_mask"] for r in results],
                "token_type_ids": [r["token_type_ids"] for r in results],
            }
            # BUGFIX: _encode_single omits "morphological_tokens" when the
            # flag is False, so collecting it unconditionally raised KeyError
            # for batch input with return_morphological_tokens=False.
            if return_morphological_tokens:
                batch["morphological_tokens"] = [r["morphological_tokens"] for r in results]
            return batch
        return self._encode_single(text, return_morphological_tokens)

    def _encode_single(self, text: str, with_morph: bool) -> dict:
        """Encode one string; ``with_morph`` controls the extra metadata key."""
        morph = self._get_morph()
        tokens = morph.tokenize(text)
        input_ids = [_stable_hash(t["token"]) for t in tokens]
        attn_mask = [1] * len(tokens)
        # Unknown token_type labels map to 0 (same bucket as ROOT/FOREIGN).
        type_ids = [_MTYPE_ID.get(t["token_type"], 0) for t in tokens]
        out: dict = {
            "input_ids": input_ids,
            "attention_mask": attn_mask,
            "token_type_ids": type_ids,
        }
        if with_morph:
            out["morphological_tokens"] = tokens
        return out

    # ── Convenience helpers ───────────────────────────────────────────────
    def encode(self, text: str, **kwargs) -> list[int]:  # type: ignore[override]
        """Return hash-stable IDs only (no mask/type metadata)."""
        return self._encode_single(text, with_morph=False)["input_ids"]

    def decode(self, token_ids: list[int], **kwargs) -> str:  # type: ignore[override]
        """Not meaningful without a fixed vocab — returns empty string."""
        return ""

    def tokenize(self, text: str, **kwargs) -> list[str]:
        """Return the token strings produced by the morphological pipeline."""
        return self._tokenize(text)

    def morphological_tokenize(self, text: str) -> list[dict]:
        """Return full morphological token dicts (main NedoTurkishTokenizer output)."""
        return self._get_morph().tokenize(text)

    def batch_tokenize(self, texts: list[str], workers: int | None = None) -> list[list[dict]]:
        """Parallel morphological tokenization."""
        return self._get_morph().batch_tokenize(texts, workers=workers)

    def stats(self, tokens: list[dict]) -> dict:
        """Compute TR% and other morphological coverage stats."""
        return self._get_morph().stats(tokens)