# History note (nmstech, commit cfffd93): project renamed from
# TurkTokenizer to NedoTurkishTokenizer.
"""Fix 1: ALL CAPS inflation fix. Fix 2: Apostrophe / code-switching split."""
from __future__ import annotations
import re
from pathlib import Path
TR_CHARS = set("Γ§ΔŸΔ±ΕŸΓΆΓΌΓ‡ΔžΔ°ΕžΓ–Γœ")
_PROPER_NOUNS: set[str] | None = None
def _load_proper_nouns() -> set[str]:
global _PROPER_NOUNS
if _PROPER_NOUNS is not None:
return _PROPER_NOUNS
path = Path(__file__).parent / "data" / "turkish_proper_nouns.txt"
if path.exists():
_PROPER_NOUNS = {
line.strip().lower()
for line in path.read_text(encoding="utf-8").splitlines()
if line.strip() and not line.startswith("#")
}
else:
_PROPER_NOUNS = set()
return _PROPER_NOUNS
def _turkish_lower(s: str) -> str:
"""Turkish-aware lowercase: İ→i, I→ı (not i), then standard lower."""
return s.replace("Δ°", "i").replace("I", "Δ±").lower()
# Suffixes that can legally follow an apostrophe in Turkish writing
# (case endings, possessives, plurals, copula, derivational -li/-siz, ...).
# Sorted longest-first so greedy matching never lets a short suffix ("a")
# shadow a longer one ("lara").
TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
    [
        "nΔ±n","nin","nun","nΓΌn","dan","den","tan","ten",
        "da","de","ta","te","ya","ye","nda","nde",
        "yΔ±","yi","yu","yΓΌ","nΔ±","ni","nu","nΓΌ",
        "lar","ler","lara","lere","larΔ±","leri",
        "Δ±m","im","um","ΓΌm","Δ±n","in","un","ΓΌn",
        "mΔ±z","miz","muz","mΓΌz","nΔ±z","niz","nuz","nΓΌz",
        "dΔ±r","dir","dur","dΓΌr","tΔ±r","tir","tur","tΓΌr",
        "ki","li","lΔ±","lu","lΓΌ","sΔ±z","siz","suz","sΓΌz",
        "a","e","Δ±","i","u","ΓΌ",
    ],
    key=len,
    reverse=True,
)
# BASE'SUFFIX: base = 2+ ASCII/Turkish letters or digits, apostrophe is
# ASCII ' or U+2019 (right single quotation mark), suffix = 1-6 letters
# followed by a word boundary.
_APO_RE = re.compile(
    r"([A-Za-zΓ‡Γ§ΔžΔŸΔ°Δ±Γ–ΓΆΕžΕŸΓœΓΌ0-9]{2,})['\u2019]([A-Za-zΓ‡Γ§ΔžΔŸΔ°Δ±Γ–ΓΆΕžΕŸΓœΓΌ]{1,6})\b"
)
# A whole word of 2+ uppercase letters (ASCII A-Z plus Turkish uppercase).
_CAPS_RE = re.compile(r'\b([A-ZΓ‡ΔžΔ°Γ–ΕžΓœ]{2,})\b')
def _is_turkish_base(word: str) -> bool:
    """Return True if the word should be treated as Turkish (don't split apostrophe)."""
    lowered = _turkish_lower(word)

    # Fast path: Turkish-specific letters are a dead giveaway.
    for ch in lowered:
        if ch in TR_CHARS:
            return True

    # Known Turkish proper nouns (cities, regions) outside the TDK list.
    if lowered in _load_proper_nouns():
        return True

    # TDK dictionary hit β†’ Turkish word (or an accepted loanword).
    from ._tdk_vocab import load_tdk_words  # noqa: PLC0415
    vocab = load_tdk_words()
    if vocab and lowered in vocab:
        return True

    # Zemberek fallback: proper nouns whose lemma carries Turkish chars
    # (Δ°stanbul, Δ°zmir, ...).
    try:
        from ._root_validator import _morphology, ZEMBEREK_AVAILABLE  # noqa: PLC0415
        if ZEMBEREK_AVAILABLE and _morphology:
            for analysis in _morphology.analyze(lowered):
                lemma = str(analysis).split("]")[0].lstrip("[")
                if any(c in TR_CHARS for c in lemma):
                    return True
    except Exception:  # noqa: BLE001
        pass

    # Neither TDK nor Zemberek confirmed it: very short words stay ambiguous,
    # so treat them as Turkish.
    return len(lowered) < 4
# ── Fix 1: ALL CAPS ───────────────────────────────────────────────────────────
def _fix_all_caps(text: str) -> tuple[str, set]:
    """Lowercase every ALL-CAPS word in *text* (Turkish-aware).

    Returns:
        (modified_text, caps) where caps holds the lowercased forms, so
        _restore_caps_tokens can re-mark those words after tokenization.
    """
    caps: set[str] = set()

    def _replace(m: re.Match) -> str:
        # Lower once; the old code called _turkish_lower twice per match.
        lowered = _turkish_lower(m.group(1))
        caps.add(lowered)
        return lowered

    return _CAPS_RE.sub(_replace, text), caps
def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
    """Re-insert <uppercase_word> markers for words lowered by _fix_all_caps.

    Two cases:
      * a ROOT token whose lowered text is in *caps* β†’ prefix the marker;
      * a run of BPE pieces (word-initial " x" plus continuations with no
        leading space) whose joined lowered text is in *caps* β†’ replace the
        run with the marker plus one merged ROOT token.
    """
    result: list[dict] = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        raw_low = _turkish_lower(tok["token"].strip())
        if tok["type"] == "ROOT" and raw_low in caps:
            result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
            result.append(tok)
            i += 1
            continue
        if tok["type"] == "BPE" and tok["token"].startswith(" "):
            combined = raw_low
            j = i + 1
            # Extend with continuation pieces until we hit a caps word, grow
            # too long to plausibly be one, or reach the next word.
            # (Fix: dropped the unused `lookahead` accumulator.)
            while j < len(tokens):
                nt = tokens[j]
                if nt["token"].startswith(" "):
                    break  # next word begins
                combined += _turkish_lower(nt["token"].strip())
                j += 1
                if combined in caps:
                    break
                if len(combined) > 8:
                    break  # give up: too long for a caps word here
            if combined in caps:
                result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
                result.append({"token": f" {combined}", "type": "ROOT",
                               "_acronym": True, "_caps": True})
                i = j
                continue
        result.append(tok)
        i += 1
    return result
# ── Fix 2: Apostrophe split ───────────────────────────────────────────────────
#
# Strategy: record (foreign_base, suffix) pairs, replace apostrophe with space.
# After tokenization, _merge_apostrophe_tokens uses these pairs to find the
# BPE pieces that form the foreign word and merge them into one FOREIGN ROOT,
# then marks the following word-initial suffix token as SUFFIX.
#
# Old approach used a \ue001 separator β€” the base tokenizer converts that to
# '<unknown>' so the separator was never found. Simple-space + pair-list is
# robust regardless of how the tokenizer handles the input.
def _split_apostrophe(text: str) -> tuple[str, list[tuple[str, str]]]:
    """
    Replace FOREIGN'SUFFIX with 'FOREIGN SUFFIX' (apostrophe β†’ space).

    Returns:
        (modified_text, [(foreign_base_lower, suffix_lower), ...]).
    Turkish proper names (Δ°stanbul'da) are left unchanged.
    """
    splits: list[tuple[str, str]] = []

    def _repl(m: re.Match) -> str:
        base, suffix = m.group(1), m.group(2)
        if _is_turkish_base(base):
            return m.group(0)  # leave Turkish names alone
        # Fix: Turkish-aware lowering ("Δ°N" β†’ "in", "NIN" β†’ "nΔ±n"). Plain
        # str.lower() produced "iΜ‡n"/"nin", which never matched the
        # _turkish_lower comparison in _merge_apostrophe_tokens.
        sl = _turkish_lower(suffix)
        if sl in TURKISH_SUFFIXES_AFTER_APOSTROPHE:
            splits.append((_turkish_lower(base), sl))
            return f"{base} {suffix}"  # just drop the apostrophe
        return m.group(0)

    return _APO_RE.sub(_repl, text), splits
def _merge_apostrophe_tokens(
    tokens: list[dict], apo_splits: list[tuple[str, str]]
) -> list[dict]:
    """
    For each (foreign_base, suffix) pair recorded during _split_apostrophe,
    find the consecutive BPE/ROOT pieces that together spell foreign_base,
    merge them into one FOREIGN ROOT token, and mark the next word-initial
    token whose stripped form == suffix as SUFFIX.
    """
    if not apo_splits:
        return tokens
    result = list(tokens)
    # Pairs were recorded left-to-right by the regex sub, so resume each
    # search after the previous match. Fix: the old code restarted from the
    # beginning for every pair, binding duplicate (base, suffix) pairs to the
    # same first occurrence and leaving later occurrences untouched.
    search_from = 1
    for foreign_base, suffix in apo_splits:
        for j in range(search_from, len(result)):
            tok_j = result[j]
            # Candidate suffix token: word-initial, stripped == suffix
            if not tok_j["token"].startswith(" "):
                continue
            if _turkish_lower(tok_j["token"].strip()) != suffix:
                continue
            # Walk back to find pieces of the word before j (no leading space)
            word_start = j - 1
            while word_start > 0 and not result[word_start]["token"].startswith(" "):
                word_start -= 1
            pieces = result[word_start:j]
            if not pieces:
                continue
            combined = "".join(_turkish_lower(p["token"].strip()) for p in pieces)
            if combined != foreign_base:
                continue
            # Merge pieces into one FOREIGN ROOT (first piece keeps its space)
            merged = pieces[0]["token"] + "".join(p["token"].strip() for p in pieces[1:])
            new_root = {"token": merged, "type": "ROOT", "_foreign": True}
            new_suf = {**tok_j, "type": "SUFFIX", "_apo_suffix": True}
            result = (
                result[:word_start]
                + [new_root, new_suf]
                + result[j + 1:]
            )
            search_from = word_start + 2  # continue after the merged pair
            break  # this pair is handled
    return result
# ── Combined pre / post ───────────────────────────────────────────────────────
def preprocess(text: str) -> tuple[str, set, list]:
    """Prepare text before base tokenization.

    Returns:
        (modified_text, caps_set, apo_splits)
    """
    # Order matters: ALL-CAPS words are lowered first, so apostrophe handling
    # sees already-normalized case.
    lowered_text, caps_set = _fix_all_caps(text)
    final_text, apo_splits = _split_apostrophe(lowered_text)
    return final_text, caps_set, apo_splits
def postprocess(
    tokens: list[dict], caps: set, apo_splits: list | None = None
) -> list[dict]:
    """Fix tokens after base tokenization."""
    restored = _restore_caps_tokens(tokens, caps)
    pairs = apo_splits if apo_splits else []
    return _merge_apostrophe_tokens(restored, pairs)