Fix İ lowercase bug + apostrophe merge for BPE-split foreign words
turk_tokenizer/_preprocessor.py — changed (+101 −35 lines)
turk_tokenizer/_preprocessor.py
CHANGED
|
@@ -6,6 +6,12 @@ import re
|
|
| 6 |
|
| 7 |
TR_CHARS = set("çğışöüÇĞİŞÖÜ")
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
KNOWN_TURKISH_BASES = {
|
| 10 |
"istanbul", "ankara", "izmir", "türkiye", "anadolu", "boğaziçi",
|
| 11 |
"cumhuriyet", "atatürk", "karadeniz", "marmara", "ege", "akdeniz",
|
|
@@ -21,6 +27,7 @@ KNOWN_FOREIGN_BASES = {
|
|
| 21 |
"chatgpt", "openai", "claude", "gemini", "llama", "bert",
|
| 22 |
"excel", "powerpoint", "outlook", "teams", "slack", "notion",
|
| 23 |
"spotify", "netflix", "amazon", "alibaba", "huawei", "samsung",
|
|
|
|
| 24 |
}
|
| 25 |
|
| 26 |
TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
|
|
@@ -39,11 +46,10 @@ TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
|
|
| 39 |
reverse=True,
|
| 40 |
)
|
| 41 |
|
| 42 |
-
|
| 43 |
-
_APO_RE = re.compile(
|
| 44 |
r"([A-Za-zÇçĞğİıÖöŞşÜü0-9]{2,})['\u2019]([A-Za-zÇçĞğİıÖöŞşÜü]{1,6})\b"
|
| 45 |
)
|
| 46 |
-
_CAPS_RE
|
| 47 |
|
| 48 |
|
| 49 |
def _is_turkish_base(word: str) -> bool:
|
|
@@ -66,8 +72,8 @@ def _fix_all_caps(text: str) -> tuple[str, set]:
|
|
| 66 |
|
| 67 |
def _replace(m: re.Match) -> str:
|
| 68 |
w = m.group(1)
|
| 69 |
-
caps.add(
|
| 70 |
-
return
|
| 71 |
|
| 72 |
return _CAPS_RE.sub(_replace, text), caps
|
| 73 |
|
|
@@ -77,7 +83,7 @@ def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
|
|
| 77 |
i = 0
|
| 78 |
while i < len(tokens):
|
| 79 |
tok = tokens[i]
|
| 80 |
-
raw_low = tok["token"].strip()
|
| 81 |
|
| 82 |
if tok["type"] == "ROOT" and raw_low in caps:
|
| 83 |
result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
|
|
@@ -92,7 +98,7 @@ def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
|
|
| 92 |
while j < len(tokens):
|
| 93 |
nt = tokens[j]
|
| 94 |
if not nt["token"].startswith(" "):
|
| 95 |
-
combined += nt["token"].strip()
|
| 96 |
lookahead.append(nt)
|
| 97 |
j += 1
|
| 98 |
if combined in caps:
|
|
@@ -115,49 +121,109 @@ def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
|
|
| 115 |
|
| 116 |
|
| 117 |
# ── Fix 2: Apostrophe split ───────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
-
def _split_apostrophe(text: str) -> str:
|
| 120 |
def _repl(m: re.Match) -> str:
|
| 121 |
base, suffix = m.group(1), m.group(2)
|
| 122 |
if _is_turkish_base(base):
|
| 123 |
-
return m.group(0)
|
| 124 |
-
|
| 125 |
-
|
|
|
|
|
|
|
| 126 |
return m.group(0)
|
| 127 |
|
| 128 |
-
return _APO_RE.sub(_repl, text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
-
def _merge_apostrophe_tokens(tokens: list[dict]) -> list[dict]:
|
| 132 |
-
result: list[dict] = []
|
| 133 |
-
i = 0
|
| 134 |
-
while i < len(tokens):
|
| 135 |
-
tok = tokens[i]
|
| 136 |
-
if _APO_SEP in tok["token"].strip():
|
| 137 |
-
if result:
|
| 138 |
-
result[-1]["type"] = "ROOT"
|
| 139 |
-
result[-1]["_foreign"] = True
|
| 140 |
-
i += 1
|
| 141 |
-
if i < len(tokens):
|
| 142 |
-
tokens[i]["type"] = "SUFFIX"
|
| 143 |
-
tokens[i]["_apo_suffix"] = True
|
| 144 |
-
result.append(tokens[i])
|
| 145 |
-
i += 1
|
| 146 |
-
else:
|
| 147 |
-
result.append(tok)
|
| 148 |
-
i += 1
|
| 149 |
return result
|
| 150 |
|
| 151 |
|
| 152 |
# ── Combined pre / post ───────────────────────────────────────────────────────
|
| 153 |
|
| 154 |
-
def preprocess(text: str) -> tuple[str, set]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
text, caps = _fix_all_caps(text)
|
| 156 |
-
text = _split_apostrophe(text)
|
| 157 |
-
return text, caps
|
| 158 |
|
| 159 |
|
| 160 |
-
def postprocess(
|
|
|
|
|
|
|
|
|
|
| 161 |
tokens = _restore_caps_tokens(tokens, caps)
|
| 162 |
-
tokens = _merge_apostrophe_tokens(tokens)
|
| 163 |
return tokens
|
|
|
|
| 6 |
|
| 7 |
TR_CHARS = set("çğışöüÇĞİŞÖÜ")
|
| 8 |
|
| 9 |
+
|
| 10 |
+
def _turkish_lower(s: str) -> str:
|
| 11 |
+
"""Turkish-aware lowercase: İ→i, I→ı (not i), then standard lower."""
|
| 12 |
+
return s.replace("İ", "i").replace("I", "ı").lower()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
KNOWN_TURKISH_BASES = {
|
| 16 |
"istanbul", "ankara", "izmir", "türkiye", "anadolu", "boğaziçi",
|
| 17 |
"cumhuriyet", "atatürk", "karadeniz", "marmara", "ege", "akdeniz",
|
|
|
|
| 27 |
"chatgpt", "openai", "claude", "gemini", "llama", "bert",
|
| 28 |
"excel", "powerpoint", "outlook", "teams", "slack", "notion",
|
| 29 |
"spotify", "netflix", "amazon", "alibaba", "huawei", "samsung",
|
| 30 |
+
"meeting", "tweet", "zoom", "email", "video",
|
| 31 |
}
|
| 32 |
|
| 33 |
TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
|
|
|
|
| 46 |
reverse=True,
|
| 47 |
)
|
| 48 |
|
| 49 |
+
_APO_RE = re.compile(
|
|
|
|
| 50 |
r"([A-Za-zÇçĞğİıÖöŞşÜü0-9]{2,})['\u2019]([A-Za-zÇçĞğİıÖöŞşÜü]{1,6})\b"
|
| 51 |
)
|
| 52 |
+
_CAPS_RE = re.compile(r'\b([A-ZÇĞİÖŞÜ]{2,})\b')
|
| 53 |
|
| 54 |
|
| 55 |
def _is_turkish_base(word: str) -> bool:
|
|
|
|
| 72 |
|
| 73 |
def _replace(m: re.Match) -> str:
|
| 74 |
w = m.group(1)
|
| 75 |
+
caps.add(_turkish_lower(w))
|
| 76 |
+
return _turkish_lower(w)
|
| 77 |
|
| 78 |
return _CAPS_RE.sub(_replace, text), caps
|
| 79 |
|
|
|
|
| 83 |
i = 0
|
| 84 |
while i < len(tokens):
|
| 85 |
tok = tokens[i]
|
| 86 |
+
raw_low = _turkish_lower(tok["token"].strip())
|
| 87 |
|
| 88 |
if tok["type"] == "ROOT" and raw_low in caps:
|
| 89 |
result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
|
|
|
|
| 98 |
while j < len(tokens):
|
| 99 |
nt = tokens[j]
|
| 100 |
if not nt["token"].startswith(" "):
|
| 101 |
+
combined += _turkish_lower(nt["token"].strip())
|
| 102 |
lookahead.append(nt)
|
| 103 |
j += 1
|
| 104 |
if combined in caps:
|
|
|
|
| 121 |
|
| 122 |
|
| 123 |
# ── Fix 2: Apostrophe split ───────────────────────────────────────────────────
|
| 124 |
+
#
|
| 125 |
+
# Strategy: record (foreign_base, suffix) pairs, replace apostrophe with space.
|
| 126 |
+
# After tokenization, _merge_apostrophe_tokens uses these pairs to find the
|
| 127 |
+
# BPE pieces that form the foreign word and merge them into one FOREIGN ROOT,
|
| 128 |
+
# then marks the following word-initial suffix token as SUFFIX.
|
| 129 |
+
#
|
| 130 |
+
# Old approach used a \ue001 separator — the base tokenizer converts that to
|
| 131 |
+
# '<unknown>' so the separator was never found. Simple-space + pair-list is
|
| 132 |
+
# robust regardless of how the tokenizer handles the input.
|
| 133 |
+
|
| 134 |
+
def _split_apostrophe(text: str) -> tuple[str, list[tuple[str, str]]]:
    """
    Replace FOREIGN'SUFFIX with 'FOREIGN SUFFIX' (apostrophe → space).

    Returns (modified_text, [(foreign_base_lower, suffix_lower), ...]);
    the pair list is consumed by _merge_apostrophe_tokens after the base
    tokenizer has run. Turkish proper names (İstanbul'da) are left unchanged.
    """
    splits: list[tuple[str, str]] = []

    def _repl(m: re.Match) -> str:
        base, suffix = m.group(1), m.group(2)
        if _is_turkish_base(base):
            return m.group(0)  # leave Turkish names alone
        # Must lower with the Turkish mapping: plain .lower() maps "İN" to
        # "i̇n" (dotted i + combining dot) and "IN" to "in", neither of which
        # would match the suffix list, and both of which would disagree with
        # the _turkish_lower() comparison done in _merge_apostrophe_tokens.
        sl = _turkish_lower(suffix)
        if sl in TURKISH_SUFFIXES_AFTER_APOSTROPHE:
            splits.append((_turkish_lower(base), sl))
            return f"{base} {suffix}"  # just drop the apostrophe
        return m.group(0)

    return _APO_RE.sub(_repl, text), splits
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def _merge_apostrophe_tokens(
    tokens: list[dict], apo_splits: list[tuple[str, str]]
) -> list[dict]:
    """
    Re-join the BPE pieces of apostrophe-split foreign words.

    For each (foreign_base, suffix) pair recorded during _split_apostrophe,
    find the consecutive BPE/ROOT pieces that together spell foreign_base,
    merge them into one FOREIGN ROOT token, and mark the next word-initial
    token whose stripped, Turkish-lowered form equals suffix as SUFFIX.
    """
    if not apo_splits:
        return tokens

    result = list(tokens)

    for foreign_base, suffix in apo_splits:
        # j starts at 1: a suffix can never be the very first token.
        for j in range(1, len(result)):
            tok_j = result[j]
            # Candidate suffix token: word-initial, stripped == suffix.
            if not tok_j["token"].startswith(" "):
                continue
            # Skip spots already merged for an earlier (identical) pair so a
            # duplicated base'suffix in the text advances to its next
            # occurrence instead of re-matching the first one again.
            if tok_j.get("_apo_suffix"):
                continue
            if _turkish_lower(tok_j["token"].strip()) != suffix:
                continue

            # Walk back to the word-initial piece (continuation pieces have
            # no leading space); word_start == 0 covers text-initial words.
            word_start = j - 1
            while word_start > 0 and not result[word_start]["token"].startswith(" "):
                word_start -= 1

            # word_start <= j - 1, so there is always at least one piece.
            pieces = result[word_start:j]
            combined = "".join(_turkish_lower(p["token"].strip()) for p in pieces)
            if combined != foreign_base:
                continue

            # Merge pieces into one FOREIGN ROOT, keeping the leading space
            # of the first piece.
            merged = pieces[0]["token"] + "".join(
                p["token"].strip() for p in pieces[1:]
            )
            new_root = {"token": merged, "type": "ROOT", "_foreign": True}
            new_suf = {**tok_j, "type": "SUFFIX", "_apo_suffix": True}

            result = result[:word_start] + [new_root, new_suf] + result[j + 1:]
            break  # this pair is handled

    return result
|
| 208 |
|
| 209 |
|
| 210 |
# ── Combined pre / post ───────────────────────────────────────────────────────
|
| 211 |
|
| 212 |
+
def preprocess(text: str) -> tuple[str, set, list]:
    """Prepare raw text before handing it to the base tokenizer.

    Applies the all-caps fix first, then the foreign-word apostrophe split.

    Returns:
        (modified_text, caps_set, apo_splits) — caps_set feeds
        _restore_caps_tokens and apo_splits feeds _merge_apostrophe_tokens
        in postprocess.
    """
    decapped, caps = _fix_all_caps(text)
    split_text, apo_splits = _split_apostrophe(decapped)
    return split_text, caps, apo_splits
|
| 221 |
|
| 222 |
|
| 223 |
+
def postprocess(
    tokens: list[dict], caps: set, apo_splits: list | None = None
) -> list[dict]:
    """Fix the token stream after base tokenization.

    Restores the uppercase markers recorded by preprocess, then merges the
    BPE pieces of apostrophe-split foreign words back into single roots.
    apo_splits defaults to None (treated as "no splits") for callers that
    only need the caps restoration.
    """
    restored = _restore_caps_tokens(tokens, caps)
    return _merge_apostrophe_tokens(restored, apo_splits or [])
|