# turk-tokenizer / turk_tokenizer/_preprocessor.py
# Initial release: TurkTokenizer v1.0.0 — TR-MMLU 92%
# (commit ca41c16, verified; uploaded by nmstech)
"""Fix 1: ALL CAPS inflation fix. Fix 2: Apostrophe / code-switching split."""
from __future__ import annotations
import re
# Letters that exist in Turkish orthography but not in ASCII English (both
# cases). Presence of any of these in a word is treated as strong evidence
# that the word is Turkish (see _is_turkish_base).
TR_CHARS = set("çğışöüÇĞİŞÖÜ")

# Lowercased bases known to be Turkish even when spelled with ASCII-only
# letters: place names, regions, month names.
KNOWN_TURKISH_BASES = {
    "istanbul", "ankara", "izmir", "türkiye", "anadolu", "boğaziçi",
    "cumhuriyet", "atatürk", "karadeniz", "marmara", "ege", "akdeniz",
    "temmuz", "ocak", "şubat", "mart", "nisan", "mayıs", "haziran",
    "ağustos", "eylül", "ekim", "kasım", "aralık",
}

# Lowercased bases known to be foreign (products, companies, tech terms);
# when one of these precedes an apostrophe + Turkish suffix, the pair is a
# code-switching candidate for _split_apostrophe.
KNOWN_FOREIGN_BASES = {
    "python", "zoom", "google", "github", "twitter", "youtube",
    "instagram", "linkedin", "facebook", "whatsapp", "telegram",
    "numpy", "pandas", "django", "flask", "react", "javascript",
    "typescript", "docker", "linux", "windows", "android", "iphone",
    "chatgpt", "openai", "claude", "gemini", "llama", "bert",
    "excel", "powerpoint", "outlook", "teams", "slack", "notion",
    "spotify", "netflix", "amazon", "alibaba", "huawei", "samsung",
}

# Turkish inflectional suffixes that may follow an apostrophe (case, plural,
# possessive, copula, derivational). Kept sorted longest-first — presumably
# to favour longest-match if ever scanned in order, though the code currently
# performs only membership tests against it (TODO confirm intent).
TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
    [
        "nın","nin","nun","nün","dan","den","tan","ten",
        "da","de","ta","te","ya","ye","nda","nde",
        "yı","yi","yu","yü","nı","ni","nu","nü",
        "lar","ler","lara","lere","ları","leri",
        "ım","im","um","üm","ın","in","un","ün",
        "mız","miz","muz","müz","nız","niz","nuz","nüz",
        "dır","dir","dur","dür","tır","tir","tur","tür",
        "ki","li","lı","lu","lü","sız","siz","suz","süz",
        "a","e","ı","i","u","ü",
    ],
    key=len,
    reverse=True,
)

# Private-use sentinel inserted between a foreign base and its suffix so the
# tokenizer keeps them apart; collapsed again in _merge_apostrophe_tokens.
_APO_SEP = "\ue001"

# base'suffix (straight apostrophe or U+2019): 2+ alphanumeric characters,
# an apostrophe, then 1-6 letters ending at a word boundary.
_APO_RE = re.compile(
    r"([A-Za-zÇçĞğİıÖöŞşÜü0-9]{2,})['\u2019]([A-Za-zÇçĞğİıÖöŞşÜü]{1,6})\b"
)

# A run of 2+ uppercase letters (including Turkish capitals) delimited by
# word boundaries — the ALL-CAPS words targeted by Fix 1.
_CAPS_RE = re.compile(r'\b([A-ZÇĞİÖŞÜ]{2,})\b')
def _is_turkish_base(word: str) -> bool:
    """Heuristically decide whether *word* is a Turkish (not foreign) base.

    Order matters: the foreign blocklist wins first, then any Turkish-specific
    letter, then the Turkish allowlist; very short words default to Turkish.
    """
    lowered = word.lower()
    if lowered in KNOWN_FOREIGN_BASES:
        return False
    # Any Turkish-only character is a strong signal on its own.
    if TR_CHARS.intersection(word):
        return True
    return lowered in KNOWN_TURKISH_BASES or len(lowered) < 4
# ── Fix 1: ALL CAPS ───────────────────────────────────────────────────────────
def _fix_all_caps(text: str) -> tuple[str, set]:
caps: set[str] = set()
def _replace(m: re.Match) -> str:
w = m.group(1)
caps.add(w.lower())
return w.lower()
return _CAPS_RE.sub(_replace, text), caps
def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
result: list[dict] = []
i = 0
while i < len(tokens):
tok = tokens[i]
raw_low = tok["token"].strip().lower()
if tok["type"] == "ROOT" and raw_low in caps:
result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
result.append(tok)
i += 1
continue
if tok["type"] == "BPE" and tok["token"].startswith(" "):
combined = raw_low
lookahead = [tok]
j = i + 1
while j < len(tokens):
nt = tokens[j]
if not nt["token"].startswith(" "):
combined += nt["token"].strip().lower()
lookahead.append(nt)
j += 1
if combined in caps:
break
if len(combined) > 8:
break
else:
break
if combined in caps:
result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
result.append({"token": f" {combined}", "type": "ROOT",
"_acronym": True, "_caps": True})
i = j
continue
result.append(tok)
i += 1
return result
# ── Fix 2: Apostrophe split ───────────────────────────────────────────────────
def _split_apostrophe(text: str) -> str:
    """Separate foreign bases from their Turkish suffixes at apostrophes.

    For a match like ``zoom'da`` where the base is not judged Turkish and the
    suffix is a known Turkish suffix, the apostrophe is replaced with the
    ``_APO_SEP`` sentinel (space-padded); every other match is kept verbatim.
    """
    def _maybe_split(match: re.Match) -> str:
        base = match.group(1)
        suffix = match.group(2)
        splittable = (
            not _is_turkish_base(base)
            and suffix.lower() in TURKISH_SUFFIXES_AFTER_APOSTROPHE
        )
        if splittable:
            return f"{base} {_APO_SEP} {suffix}"
        return match.group(0)

    return _APO_RE.sub(_maybe_split, text)
def _merge_apostrophe_tokens(tokens: list[dict]) -> list[dict]:
    """Collapse the apostrophe sentinel emitted by ``_split_apostrophe``.

    The sentinel token itself is dropped; the token preceding it (if any) is
    promoted to a foreign ROOT and the token following it is re-typed as a
    SUFFIX. Neighbouring token dicts are mutated in place.
    """
    merged: list[dict] = []
    idx = 0
    total = len(tokens)
    while idx < total:
        current = tokens[idx]
        if _APO_SEP not in current["token"].strip():
            merged.append(current)
            idx += 1
            continue
        # Sentinel found: drop it and re-type its neighbours.
        if merged:
            merged[-1]["type"] = "ROOT"
            merged[-1]["_foreign"] = True
        idx += 1
        if idx < total:
            suffix_tok = tokens[idx]
            suffix_tok["type"] = "SUFFIX"
            suffix_tok["_apo_suffix"] = True
            merged.append(suffix_tok)
            idx += 1
    return merged
# ── Combined pre / post ───────────────────────────────────────────────────────
def preprocess(text: str) -> tuple[str, set]:
    """Apply both text-level fixes before tokenization.

    Lowers ALL-CAPS words (recording them in the returned set), then inserts
    the apostrophe sentinel for foreign-base + Turkish-suffix pairs.
    """
    lowered, caps = _fix_all_caps(text)
    return _split_apostrophe(lowered), caps
def postprocess(tokens: list[dict], caps: set) -> list[dict]:
    """Undo the preprocessing on the tokenizer output.

    Restores ``<uppercase_word>`` markers first, then merges apostrophe
    sentinel tokens back into ROOT + SUFFIX pairs.
    """
    return _merge_apostrophe_tokens(_restore_caps_tokens(tokens, caps))