"""Fix 1: ALL CAPS inflation fix. Fix 2: Apostrophe / code-switching split."""
from __future__ import annotations
import re
from pathlib import Path
TR_CHARS = set("Γ§ΔΔ±ΕΓΆΓΌΓΔΔ°ΕΓΓ")
_PROPER_NOUNS: set[str] | None = None
def _load_proper_nouns() -> set[str]:
global _PROPER_NOUNS
if _PROPER_NOUNS is not None:
return _PROPER_NOUNS
path = Path(__file__).parent / "data" / "turkish_proper_nouns.txt"
if path.exists():
_PROPER_NOUNS = {
line.strip().lower()
for line in path.read_text(encoding="utf-8").splitlines()
if line.strip() and not line.startswith("#")
}
else:
_PROPER_NOUNS = set()
return _PROPER_NOUNS
def _turkish_lower(s: str) -> str:
"""Turkish-aware lowercase: Δ°βi, IβΔ± (not i), then standard lower."""
return s.replace("Δ°", "i").replace("I", "Δ±").lower()
TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
[
"nΔ±n","nin","nun","nΓΌn","dan","den","tan","ten",
"da","de","ta","te","ya","ye","nda","nde",
"yΔ±","yi","yu","yΓΌ","nΔ±","ni","nu","nΓΌ",
"lar","ler","lara","lere","larΔ±","leri",
"Δ±m","im","um","ΓΌm","Δ±n","in","un","ΓΌn",
"mΔ±z","miz","muz","mΓΌz","nΔ±z","niz","nuz","nΓΌz",
"dΔ±r","dir","dur","dΓΌr","tΔ±r","tir","tur","tΓΌr",
"ki","li","lΔ±","lu","lΓΌ","sΔ±z","siz","suz","sΓΌz",
"a","e","Δ±","i","u","ΓΌ",
],
key=len,
reverse=True,
)
_APO_RE = re.compile(
r"([A-Za-zΓΓ§ΔΔΔ°Δ±ΓΓΆΕΕΓΓΌ0-9]{2,})['\u2019]([A-Za-zΓΓ§ΔΔΔ°Δ±ΓΓΆΕΕΓΓΌ]{1,6})\b"
)
_CAPS_RE = re.compile(r'\b([A-ZΓΔΔ°ΓΕΓ]{2,})\b')
def _is_turkish_base(word: str) -> bool:
    """Return True if the word should be treated as Turkish (don't split apostrophe).

    Checks, in order: Turkish-specific characters, the bundled proper-noun
    list, the TDK dictionary, and Zemberek lemma analysis.  When none of the
    dictionaries can answer, words shorter than 4 characters are treated as
    Turkish because they are too ambiguous to classify as foreign.
    """
    wl = _turkish_lower(word)
    # Fast path: Turkish-specific characters → definitely Turkish
    if any(c in TR_CHARS for c in wl):
        return True
    # Turkish proper nouns (cities, regions) — not in TDK common-word list
    if wl in _load_proper_nouns():
        return True
    # TDK lookup: if it's in the dictionary it's Turkish (or an accepted loanword).
    # Guard the import so a missing vocab module degrades to the fallback at the
    # bottom instead of raising (the original import was unguarded, contradicting
    # the "TDK unavailable" fallback comment).
    try:
        from ._tdk_vocab import load_tdk_words  # noqa: PLC0415
    except ImportError:
        tdk = None
    else:
        tdk = load_tdk_words()
    if tdk and wl in tdk:
        return True
    # Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir…)
    try:
        from ._root_validator import _morphology, ZEMBEREK_AVAILABLE  # noqa: PLC0415
        if ZEMBEREK_AVAILABLE and _morphology:
            for analysis in _morphology.analyze(wl):
                # Lemma is the text between the leading '[' and first ']'
                # of Zemberek's analysis string.
                lemma = str(analysis).split("]")[0].lstrip("[")
                if any(c in TR_CHARS for c in lemma):
                    return True
    except Exception:  # noqa: BLE001
        pass
    # TDK unavailable + Zemberek unavailable: very short words are ambiguous
    return len(wl) < 4
# ββ Fix 1: ALL CAPS βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def _fix_all_caps(text: str) -> tuple[str, set]:
    """Lowercase every ALL-CAPS word in *text* (Fix 1).

    Each matched word is lowered with Turkish casing rules and recorded, so
    a later pass (:func:`_restore_caps_tokens`) can re-mark those words with
    an ``<uppercase_word>`` token.  Returns ``(rewritten_text, lowered_words)``.
    """
    lowered_words: set[str] = set()

    def _lower_match(match: re.Match) -> str:
        lowered = _turkish_lower(match.group(1))
        lowered_words.add(lowered)
        return lowered

    rewritten = _CAPS_RE.sub(_lower_match, text)
    return rewritten, lowered_words
def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
    """Re-insert ``<uppercase_word>`` markers for words recorded by _fix_all_caps.

    Two cases are handled:
      * a ROOT token whose lowered, stripped text is in *caps* — a marker
        token is inserted before it;
      * a word-initial BPE piece (leading space) whose following no-space
        pieces concatenate into a word in *caps* — the pieces are collapsed
        into a single ROOT token and a marker is inserted before it.
    Tokens are dicts with at least "token" and "type" keys.
    """
    result: list[dict] = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        raw_low = _turkish_lower(tok["token"].strip())
        # Case 1: whole-word ROOT token that was originally ALL CAPS.
        if tok["type"] == "ROOT" and raw_low in caps:
            result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
            result.append(tok)
            i += 1
            continue
        # Case 2: word-initial BPE piece — try to assemble the full word from
        # the following continuation pieces (those without a leading space).
        if tok["type"] == "BPE" and tok["token"].startswith(" "):
            combined = raw_low
            lookahead = [tok]  # collected continuation pieces (not otherwise read)
            j = i + 1
            while j < len(tokens):
                nt = tokens[j]
                if not nt["token"].startswith(" "):
                    combined += _turkish_lower(nt["token"].strip())
                    lookahead.append(nt)
                    j += 1
                    if combined in caps:
                        break
                    if len(combined) > 8:
                        # Bail out once the assembled word exceeds 8 chars —
                        # presumably caps words are expected to be short.
                        break
                else:
                    break  # next word started; stop assembling
            if combined in caps:
                result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
                result.append({"token": f" {combined}", "type": "ROOT",
                               "_acronym": True, "_caps": True})
                i = j  # skip the pieces we just merged
                continue
        # Default: pass the token through unchanged.
        result.append(tok)
        i += 1
    return result
# ββ Fix 2: Apostrophe split βββββββββββββββββββββββββββββββββββββββββββββββββββ
#
# Strategy: record (foreign_base, suffix) pairs, replace apostrophe with space.
# After tokenization, _merge_apostrophe_tokens uses these pairs to find the
# BPE pieces that form the foreign word and merge them into one FOREIGN ROOT,
# then marks the following word-initial suffix token as SUFFIX.
#
# Old approach used a \ue001 separator β the base tokenizer converts that to
# '<unknown>' so the separator was never found. Simple-space + pair-list is
# robust regardless of how the tokenizer handles the input.
def _split_apostrophe(text: str) -> tuple[str, list[tuple[str, str]]]:
    """
    Replace FOREIGN'SUFFIX with 'FOREIGN SUFFIX' (apostrophe → space).
    Returns (modified_text, [(foreign_base_lower, suffix_lower), ...]).
    Turkish proper names (İstanbul'da) are left unchanged.
    """
    splits: list[tuple[str, str]] = []

    def _repl(m: re.Match) -> str:
        base, suffix = m.group(1), m.group(2)
        if _is_turkish_base(base):
            return m.group(0)  # leave Turkish names alone
        # Turkish-aware lowering: plain str.lower() maps "NIN" to "nin"
        # (not "nın"), so the recorded suffix would neither match the suffix
        # list correctly nor the _turkish_lower'ed tokens that
        # _merge_apostrophe_tokens compares against.
        sl = _turkish_lower(suffix)
        if sl in TURKISH_SUFFIXES_AFTER_APOSTROPHE:
            splits.append((_turkish_lower(base), sl))
            return f"{base} {suffix}"  # just drop the apostrophe
        return m.group(0)

    return _APO_RE.sub(_repl, text), splits
def _merge_apostrophe_tokens(
    tokens: list[dict], apo_splits: list[tuple[str, str]]
) -> list[dict]:
    """
    For each (foreign_base, suffix) pair recorded during _split_apostrophe,
    find the consecutive BPE/ROOT pieces that together spell foreign_base,
    merge them into one FOREIGN ROOT token, and mark the next word-initial
    token whose stripped form == suffix as SUFFIX.
    """
    if not apo_splits:
        return tokens
    result = list(tokens)  # work on a copy; original token list is not mutated
    for foreign_base, suffix in apo_splits:
        # NOTE: n is captured before scanning; result is only rebuilt after a
        # successful merge, which is immediately followed by `break`.
        n = len(result)
        for j in range(1, n):
            tok_j = result[j]
            # Candidate suffix token: word-initial, stripped == suffix
            if not tok_j["token"].startswith(" "):
                continue
            if _turkish_lower(tok_j["token"].strip()) != suffix:
                continue
            # Walk back to find pieces of the word before j (no leading space)
            word_start = j - 1
            while word_start > 0 and not result[word_start]["token"].startswith(" "):
                word_start -= 1
            pieces = result[word_start:j]
            if not pieces:
                continue
            # The pieces must spell exactly the recorded foreign base.
            combined = "".join(_turkish_lower(p["token"].strip()) for p in pieces)
            if combined != foreign_base:
                continue
            # Merge pieces into one FOREIGN ROOT
            merged = pieces[0]["token"]  # keeps leading space
            for p in pieces[1:]:
                merged += p["token"].strip()
            new_root = {"token": merged, "type": "ROOT", "_foreign": True}
            # Copy the suffix token, retagging it as SUFFIX.
            new_suf = {**tok_j, "type": "SUFFIX", "_apo_suffix": True}
            result = (
                result[:word_start]
                + [new_root, new_suf]
                + result[j + 1:]
            )
            break  # this pair is handled
    return result
# ββ Combined pre / post βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def preprocess(text: str) -> tuple[str, set, list]:
    """Prepare text before base tokenization.

    Applies Fix 1 (ALL-CAPS lowering) and then Fix 2 (apostrophe split).

    Returns:
        (modified_text, caps_set, apo_splits)
    """
    caps_fixed, caps = _fix_all_caps(text)
    final_text, apo_splits = _split_apostrophe(caps_fixed)
    return final_text, caps, apo_splits
def postprocess(
    tokens: list[dict], caps: set, apo_splits: list | None = None
) -> list[dict]:
    """Fix tokens after base tokenization.

    Restores ``<uppercase_word>`` markers (Fix 1), then merges foreign
    base + suffix pieces recorded during preprocessing (Fix 2).
    """
    restored = _restore_caps_tokens(tokens, caps)
    pairs = apo_splits if apo_splits else []
    return _merge_apostrophe_tokens(restored, pairs)
|