nmstech committed on
Commit
fffa764
Β·
verified Β·
1 Parent(s): a0e8f24

Add AutoTokenizer support (trust_remote_code)

Browse files
Files changed (1) hide show
  1. tokenization_turk.py +172 -0
tokenization_turk.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ TurkTokenizer β€” HuggingFace AutoTokenizer compatible class.
3
+
4
+ Usage:
5
+ from transformers import AutoTokenizer
6
+
7
+ tok = AutoTokenizer.from_pretrained("Ethosoft/turk-tokenizer", trust_remote_code=True)
8
+ out = tok("Δ°stanbul'da meeting'e katΔ±lamadΔ±m")
9
+
10
+ out["input_ids"] # hash-stable int IDs of morphological tokens
11
+ out["attention_mask"] # all 1s
12
+ out["token_type_ids"] # 0=root/other, 1=suffix
13
+ out["morphological_tokens"] # full morphological dicts (token, token_type, morph_pos, ...)
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import hashlib
19
+ from typing import Any
20
+
21
+ from transformers import PreTrainedTokenizer
22
+
23
+ # ── Morphological type β†’ token_type_id ───────────────────────────────────────
24
+ _MTYPE_ID = {
25
+ "ROOT": 0,
26
+ "FOREIGN": 0,
27
+ "SUFFIX": 1,
28
+ "BPE": 2,
29
+ "PUNCT": 3,
30
+ "NUM": 4,
31
+ "DATE": 4,
32
+ "UNIT": 4,
33
+ "URL": 5,
34
+ "MENTION": 5,
35
+ "HASHTAG": 5,
36
+ "EMOJI": 5,
37
+ }
38
+
39
+
40
+ def _stable_hash(s: str) -> int:
41
+ """MD5-based stable hash that does NOT change between Python runs."""
42
+ return int(hashlib.md5(s.encode("utf-8")).hexdigest()[:6], 16)
43
+
44
+
45
class TurkTokenizer(PreTrainedTokenizer):
    """
    Turkish morphological tokenizer — HuggingFace compatible.

    ``input_ids`` are MD5-hash-based stable IDs (not lookup-table vocab IDs).
    For downstream transformer use, embed by ``token_type_ids`` or learn a
    projection from the ``morphological_tokens`` metadata.

    All standard HuggingFace fields are present:
        input_ids, attention_mask, token_type_ids

    Extra field:
        morphological_tokens — list[dict] with token, token_type, morph_pos, ...
    """

    # No vocab files: IDs come from a stable hash, not a lookup table.
    vocab_files_names: dict = {}
    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]

    def __init__(self, **kwargs: Any) -> None:
        # The lazy-init slot must exist BEFORE super().__init__() runs:
        # PreTrainedTokenizer.__init__ may tokenize added/special tokens,
        # which calls our _tokenize → _get_morph and would otherwise raise
        # AttributeError on a not-yet-assigned ``_morph``.
        self._morph: "TurkTokenizer_core | None" = None  # lazy init
        super().__init__(**kwargs)

    def _get_morph(self):
        """Lazily import and cache the core morphological tokenizer."""
        if self._morph is None:
            # Imported here so the HF wrapper can be loaded without the core
            # package being needed at import time.
            from turk_tokenizer import TurkTokenizer as _Core  # noqa: PLC0415
            self._morph = _Core()
        return self._morph

    # ── PreTrainedTokenizer required interface ────────────────────────────────

    @property
    def vocab_size(self) -> int:
        # Matches the 6-hex-digit (24-bit) space used by _stable_hash.
        return 16_777_216  # 2^24 — MD5 hash space

    def get_vocab(self) -> dict:
        return {}  # no fixed vocabulary

    def _tokenize(self, text: str) -> list[str]:
        """Return token strings from the morphological pipeline."""
        tokens = self._get_morph().tokenize(text)
        return [t["token"] for t in tokens]

    def _convert_token_to_id(self, token: str) -> int:
        """Hash-based stable ID; no vocab lookup involved."""
        return _stable_hash(token)

    def _convert_id_to_token(self, index: int) -> str:
        return ""  # no inverse mapping without a vocab

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: str | None = None,
    ) -> tuple:
        """No vocab file to write — return an empty tuple of paths."""
        return ()  # no vocab file

    # ── Main __call__ override ────────────────────────────────────────────────

    def __call__(
        self,
        text: str | list[str],
        return_morphological_tokens: bool = True,
        **kwargs: Any,
    ) -> dict:
        """
        Tokenize text and return a dict with standard HuggingFace fields
        plus ``morphological_tokens``.

        Args:
            text: Single string or list of strings.
            return_morphological_tokens: Include full morphological dicts.

        Returns:
            dict with:
                input_ids            : list[int] or list[list[int]]
                attention_mask       : list[int] or list[list[int]]
                token_type_ids       : list[int] or list[list[int]]
                morphological_tokens : list[dict] or list[list[dict]]

        Note:
            Extra HF kwargs (padding, truncation, return_tensors, ...) are
            accepted but currently ignored by this override.
        """
        if isinstance(text, list):
            # Batch path: encode each text, then transpose into columnar lists.
            results = [self._encode_single(t, return_morphological_tokens) for t in text]
            return {
                "input_ids": [r["input_ids"] for r in results],
                "attention_mask": [r["attention_mask"] for r in results],
                "token_type_ids": [r["token_type_ids"] for r in results],
                "morphological_tokens": [r["morphological_tokens"] for r in results],
            }
        return self._encode_single(text, return_morphological_tokens)

    def _encode_single(self, text: str, with_morph: bool) -> dict:
        """Encode one string into the HF-style output dict."""
        morph = self._get_morph()
        tokens = morph.tokenize(text)

        input_ids = [_stable_hash(t["token"]) for t in tokens]
        attn_mask = [1] * len(tokens)  # no padding → every position is real
        # Unknown morphological types fall back to 0 (same bucket as ROOT).
        type_ids = [_MTYPE_ID.get(t["token_type"], 0) for t in tokens]

        out: dict = {
            "input_ids": input_ids,
            "attention_mask": attn_mask,
            "token_type_ids": type_ids,
        }
        if with_morph:
            out["morphological_tokens"] = tokens
        return out

    # ── Convenience helpers ───────────────────────────────────────────────────

    def encode(self, text: str, **kwargs) -> list[int]:  # type: ignore[override]
        """Return only the stable input_ids for *text*."""
        return self._encode_single(text, with_morph=False)["input_ids"]

    def decode(self, token_ids: list[int], **kwargs) -> str:  # type: ignore[override]
        """Not meaningful without a fixed vocab — returns empty string."""
        return ""

    def tokenize(self, text: str, **kwargs) -> list[str]:
        """Return the plain token strings (no morphological metadata)."""
        return self._tokenize(text)

    def morphological_tokenize(self, text: str) -> list[dict]:
        """Return full morphological token dicts (main TurkTokenizer output)."""
        return self._get_morph().tokenize(text)

    def batch_tokenize(self, texts: list[str], workers: int | None = None) -> list[list[dict]]:
        """Parallel morphological tokenization."""
        return self._get_morph().batch_tokenize(texts, workers=workers)

    def stats(self, tokens: list[dict]) -> dict:
        """Compute TR% and other morphological coverage stats."""
        return self._get_morph().stats(tokens)