Fix İ lowercase bug + apostrophe merge for BPE-split foreign words
turk_tokenizer/_preprocessor.py — changed (+101 −35 lines)
turk_tokenizer/_preprocessor.py
CHANGED
|
@@ -6,6 +6,12 @@ import re
|
|
| 6 |
|
| 7 |
TR_CHARS = set("çğışöüÇĞİŞÖÜ")
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
KNOWN_TURKISH_BASES = {
|
| 10 |
"istanbul", "ankara", "izmir", "türkiye", "anadolu", "boğaziçi",
|
| 11 |
"cumhuriyet", "atatürk", "karadeniz", "marmara", "ege", "akdeniz",
|
|
@@ -21,6 +27,7 @@ KNOWN_FOREIGN_BASES = {
|
|
| 21 |
"chatgpt", "openai", "claude", "gemini", "llama", "bert",
|
| 22 |
"excel", "powerpoint", "outlook", "teams", "slack", "notion",
|
| 23 |
"spotify", "netflix", "amazon", "alibaba", "huawei", "samsung",
|
|
|
|
| 24 |
}
|
| 25 |
|
| 26 |
TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
|
|
@@ -39,11 +46,10 @@ TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
|
|
| 39 |
reverse=True,
|
| 40 |
)
|
| 41 |
|
| 42 |
-
|
| 43 |
-
_APO_RE = re.compile(
|
| 44 |
r"([A-Za-zÇçĞğİıÖöŞşÜü0-9]{2,})['\u2019]([A-Za-zÇçĞğİıÖöŞşÜü]{1,6})\b"
|
| 45 |
)
|
| 46 |
-
_CAPS_RE
|
| 47 |
|
| 48 |
|
| 49 |
def _is_turkish_base(word: str) -> bool:
|
|
@@ -66,8 +72,8 @@ def _fix_all_caps(text: str) -> tuple[str, set]:
|
|
| 66 |
|
| 67 |
def _replace(m: re.Match) -> str:
|
| 68 |
w = m.group(1)
|
| 69 |
-
caps.add(
|
| 70 |
-
return
|
| 71 |
|
| 72 |
return _CAPS_RE.sub(_replace, text), caps
|
| 73 |
|
|
@@ -77,7 +83,7 @@ def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
|
|
| 77 |
i = 0
|
| 78 |
while i < len(tokens):
|
| 79 |
tok = tokens[i]
|
| 80 |
-
raw_low = tok["token"].strip()
|
| 81 |
|
| 82 |
if tok["type"] == "ROOT" and raw_low in caps:
|
| 83 |
result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
|
|
@@ -92,7 +98,7 @@ def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
|
|
| 92 |
while j < len(tokens):
|
| 93 |
nt = tokens[j]
|
| 94 |
if not nt["token"].startswith(" "):
|
| 95 |
-
combined += nt["token"].strip()
|
| 96 |
lookahead.append(nt)
|
| 97 |
j += 1
|
| 98 |
if combined in caps:
|
|
@@ -115,49 +121,109 @@ def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
|
|
| 115 |
|
| 116 |
|
| 117 |
# ── Fix 2: Apostrophe split ───────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
-
def _split_apostrophe(text: str) -> str:
|
| 120 |
def _repl(m: re.Match) -> str:
|
| 121 |
base, suffix = m.group(1), m.group(2)
|
| 122 |
if _is_turkish_base(base):
|
| 123 |
-
return m.group(0)
|
| 124 |
-
|
| 125 |
-
|
|
|
|
|
|
|
| 126 |
return m.group(0)
|
| 127 |
|
| 128 |
-
return _APO_RE.sub(_repl, text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
-
def _merge_apostrophe_tokens(tokens: list[dict]) -> list[dict]:
|
| 132 |
-
result: list[dict] = []
|
| 133 |
-
i = 0
|
| 134 |
-
while i < len(tokens):
|
| 135 |
-
tok = tokens[i]
|
| 136 |
-
if _APO_SEP in tok["token"].strip():
|
| 137 |
-
if result:
|
| 138 |
-
result[-1]["type"] = "ROOT"
|
| 139 |
-
result[-1]["_foreign"] = True
|
| 140 |
-
i += 1
|
| 141 |
-
if i < len(tokens):
|
| 142 |
-
tokens[i]["type"] = "SUFFIX"
|
| 143 |
-
tokens[i]["_apo_suffix"] = True
|
| 144 |
-
result.append(tokens[i])
|
| 145 |
-
i += 1
|
| 146 |
-
else:
|
| 147 |
-
result.append(tok)
|
| 148 |
-
i += 1
|
| 149 |
return result
|
| 150 |
|
| 151 |
|
| 152 |
# ── Combined pre / post ───────────────────────────────────────────────────────
|
| 153 |
|
| 154 |
-
def preprocess(text: str) -> tuple[str, set]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
text, caps = _fix_all_caps(text)
|
| 156 |
-
text = _split_apostrophe(text)
|
| 157 |
-
return text, caps
|
| 158 |
|
| 159 |
|
| 160 |
-
def postprocess(
|
|
|
|
|
|
|
|
|
|
| 161 |
tokens = _restore_caps_tokens(tokens, caps)
|
| 162 |
-
tokens = _merge_apostrophe_tokens(tokens)
|
| 163 |
return tokens
|
|
|
|
| 6 |
|
| 7 |
TR_CHARS = set("çğışöüÇĞİŞÖÜ")
|
| 8 |
|
| 9 |
+
|
| 10 |
+
def _turkish_lower(s: str) -> str:
|
| 11 |
+
"""Turkish-aware lowercase: İ→i, I→ı (not i), then standard lower."""
|
| 12 |
+
return s.replace("İ", "i").replace("I", "ı").lower()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
KNOWN_TURKISH_BASES = {
|
| 16 |
"istanbul", "ankara", "izmir", "türkiye", "anadolu", "boğaziçi",
|
| 17 |
"cumhuriyet", "atatürk", "karadeniz", "marmara", "ege", "akdeniz",
|
|
|
|
| 27 |
"chatgpt", "openai", "claude", "gemini", "llama", "bert",
|
| 28 |
"excel", "powerpoint", "outlook", "teams", "slack", "notion",
|
| 29 |
"spotify", "netflix", "amazon", "alibaba", "huawei", "samsung",
|
| 30 |
+
"meeting", "tweet", "zoom", "email", "video",
|
| 31 |
}
|
| 32 |
|
| 33 |
TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
|
|
|
|
| 46 |
reverse=True,
|
| 47 |
)
|
| 48 |
|
| 49 |
+
_APO_RE = re.compile(
|
|
|
|
| 50 |
r"([A-Za-zÇçĞğİıÖöŞşÜü0-9]{2,})['\u2019]([A-Za-zÇçĞğİıÖöŞşÜü]{1,6})\b"
|
| 51 |
)
|
| 52 |
+
_CAPS_RE = re.compile(r'\b([A-ZÇĞİÖŞÜ]{2,})\b')
|
| 53 |
|
| 54 |
|
| 55 |
def _is_turkish_base(word: str) -> bool:
|
|
|
|
| 72 |
|
| 73 |
def _replace(m: re.Match) -> str:
|
| 74 |
w = m.group(1)
|
| 75 |
+
caps.add(_turkish_lower(w))
|
| 76 |
+
return _turkish_lower(w)
|
| 77 |
|
| 78 |
return _CAPS_RE.sub(_replace, text), caps
|
| 79 |
|
|
|
|
| 83 |
i = 0
|
| 84 |
while i < len(tokens):
|
| 85 |
tok = tokens[i]
|
| 86 |
+
raw_low = _turkish_lower(tok["token"].strip())
|
| 87 |
|
| 88 |
if tok["type"] == "ROOT" and raw_low in caps:
|
| 89 |
result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
|
|
|
|
| 98 |
while j < len(tokens):
|
| 99 |
nt = tokens[j]
|
| 100 |
if not nt["token"].startswith(" "):
|
| 101 |
+
combined += _turkish_lower(nt["token"].strip())
|
| 102 |
lookahead.append(nt)
|
| 103 |
j += 1
|
| 104 |
if combined in caps:
|
|
|
|
| 121 |
|
| 122 |
|
| 123 |
# ── Fix 2: Apostrophe split ───────────────────────────────────────────────────
|
| 124 |
+
#
|
| 125 |
+
# Strategy: record (foreign_base, suffix) pairs, replace apostrophe with space.
|
| 126 |
+
# After tokenization, _merge_apostrophe_tokens uses these pairs to find the
|
| 127 |
+
# BPE pieces that form the foreign word and merge them into one FOREIGN ROOT,
|
| 128 |
+
# then marks the following word-initial suffix token as SUFFIX.
|
| 129 |
+
#
|
| 130 |
+
# Old approach used a \ue001 separator — the base tokenizer converts that to
|
| 131 |
+
# '<unknown>' so the separator was never found. Simple-space + pair-list is
|
| 132 |
+
# robust regardless of how the tokenizer handles the input.
|
| 133 |
+
|
| 134 |
+
def _split_apostrophe(text: str) -> tuple[str, list[tuple[str, str]]]:
    """
    Replace FOREIGN'SUFFIX with 'FOREIGN SUFFIX' (apostrophe → space).

    Returns (modified_text, [(foreign_base_lower, suffix_lower), ...]);
    the pair list is consumed by _merge_apostrophe_tokens after the base
    tokenizer has run. Turkish proper names (İstanbul'da) are left unchanged.
    """
    splits: list[tuple[str, str]] = []

    def _repl(m: re.Match) -> str:
        base, suffix = m.group(1), m.group(2)
        if _is_turkish_base(base):
            return m.group(0)  # leave Turkish names alone
        # Must lower with the Turkish mapping: plain .lower() maps "İN" to
        # "i̇n" (dotted i + combining dot) and "IN" to "in", neither of which
        # would match the suffix list, and both of which would disagree with
        # the _turkish_lower() comparison done in _merge_apostrophe_tokens.
        sl = _turkish_lower(suffix)
        if sl in TURKISH_SUFFIXES_AFTER_APOSTROPHE:
            splits.append((_turkish_lower(base), sl))
            return f"{base} {suffix}"  # just drop the apostrophe
        return m.group(0)

    return _APO_RE.sub(_repl, text), splits
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def _merge_apostrophe_tokens(
    tokens: list[dict], apo_splits: list[tuple[str, str]]
) -> list[dict]:
    """
    Re-join the BPE pieces of apostrophe-split foreign words.

    For each (foreign_base, suffix) pair recorded during _split_apostrophe,
    find the consecutive BPE/ROOT pieces that together spell foreign_base,
    merge them into one FOREIGN ROOT token, and mark the next word-initial
    token whose stripped, Turkish-lowered form equals suffix as SUFFIX.
    """
    if not apo_splits:
        return tokens

    result = list(tokens)

    for foreign_base, suffix in apo_splits:
        # j starts at 1: a suffix can never be the very first token.
        for j in range(1, len(result)):
            tok_j = result[j]
            # Candidate suffix token: word-initial, stripped == suffix.
            if not tok_j["token"].startswith(" "):
                continue
            # Skip spots already merged for an earlier (identical) pair so a
            # duplicated base'suffix in the text advances to its next
            # occurrence instead of re-matching the first one again.
            if tok_j.get("_apo_suffix"):
                continue
            if _turkish_lower(tok_j["token"].strip()) != suffix:
                continue

            # Walk back to the word-initial piece (continuation pieces have
            # no leading space); word_start == 0 covers text-initial words.
            word_start = j - 1
            while word_start > 0 and not result[word_start]["token"].startswith(" "):
                word_start -= 1

            # word_start <= j - 1, so there is always at least one piece.
            pieces = result[word_start:j]
            combined = "".join(_turkish_lower(p["token"].strip()) for p in pieces)
            if combined != foreign_base:
                continue

            # Merge pieces into one FOREIGN ROOT, keeping the leading space
            # of the first piece.
            merged = pieces[0]["token"] + "".join(
                p["token"].strip() for p in pieces[1:]
            )
            new_root = {"token": merged, "type": "ROOT", "_foreign": True}
            new_suf = {**tok_j, "type": "SUFFIX", "_apo_suffix": True}

            result = result[:word_start] + [new_root, new_suf] + result[j + 1:]
            break  # this pair is handled

    return result
|
| 208 |
|
| 209 |
|
| 210 |
# ── Combined pre / post ───────────────────────────────────────────────────────
|
| 211 |
|
| 212 |
+
def preprocess(text: str) -> tuple[str, set, list]:
    """Prepare raw text before handing it to the base tokenizer.

    Applies the all-caps fix first, then the foreign-word apostrophe split.

    Returns:
        (modified_text, caps_set, apo_splits) — caps_set feeds
        _restore_caps_tokens and apo_splits feeds _merge_apostrophe_tokens
        in postprocess.
    """
    decapped, caps = _fix_all_caps(text)
    split_text, apo_splits = _split_apostrophe(decapped)
    return split_text, caps, apo_splits
|
| 221 |
|
| 222 |
|
| 223 |
+
def postprocess(
    tokens: list[dict], caps: set, apo_splits: list | None = None
) -> list[dict]:
    """Fix the token stream after base tokenization.

    Restores the uppercase markers recorded by preprocess, then merges the
    BPE pieces of apostrophe-split foreign words back into single roots.
    apo_splits defaults to None (treated as "no splits") for callers that
    only need the caps restoration.
    """
    restored = _restore_caps_tokens(tokens, caps)
    return _merge_apostrophe_tokens(restored, apo_splits or [])
|