Fix broken placeholder mechanism: replace with segment-based tokenization

Browse files

Files changed (4) hide show

.claude/settings.local.json +15 -0
.gitignore +10 -0
turk_tokenizer/_normalizer.py +119 -55
turk_tokenizer/tokenizer.py +31 -16

.claude/settings.local.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "permissions": {
+    "allow": [
+      "Bash(git add:*)",
+      "Bash(git push:*)",
+      "Bash(git remote:*)",
+      "Bash(git fetch:*)",
+      "Bash(hf whoami:*)",
+      "Bash(huggingface-cli whoami:*)",
+      "Bash(python3:*)",
+      "Bash(git lfs:*)",
+      "Bash(sudo apt-get:*)"
+    ]
+  }
+}

.gitignore ADDED Viewed

	@@ -0,0 +1,10 @@

+__pycache__/
+*.pyc
+*.pyo
+*.egg-info/
+dist/
+build/
+.eggs/
+*.egg
+.env
+.venv/

turk_tokenizer/_normalizer.py CHANGED Viewed

@@ -1,4 +1,8 @@
-"""Fix 8: Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI)."""
 from __future__ import annotations
@@ -26,21 +30,54 @@ ROMAN_NUMERALS = {
     "xi","xii","xiii","xiv","xv","xvi","xvii","xviii","xix","xx",
 }
 URL_RE         = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
 MENTION_RE     = re.compile(r'@[\w\u00C0-\u024F]+')
 HASHTAG_RE     = re.compile(r'#[\w\u00C0-\u024F]+')
-NUMBER_RE      = re.compile(
-    r'%\d+[\.,]?\d*'
-    r'|\d+[\.,]\d+'
-    r'|\d{1,3}(?:\.\d{3})+'
-    r'|\d+%'
-    r'|\d+/\d+'
 )
 DATE_RE        = re.compile(
     r'\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}'
     r'|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}'
 )
 CURRENCY_RE    = re.compile(r'[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]')
 TEXT_EMOJI_RE  = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
 UNICODE_EMOJI_RE = re.compile(
     "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
@@ -50,62 +87,89 @@ UNICODE_EMOJI_RE = re.compile(
     flags=re.UNICODE,
 )
-def preprocess_special_tokens(text: str) -> tuple[str, list[dict]]:
-    """Replace special tokens with placeholders before base tokenization."""
-    placeholders: list[dict] = []
-    counter = [0]
-    def _ph(token_type: str, original: str) -> str:
-        ph = f"\x00{token_type}{counter[0]}\x00"
-        placeholders.append({"placeholder": ph, "type": token_type, "original": original})
-        counter[0] += 1
-        return ph
-    def _replace(pattern: re.Pattern, ttype: str, t: str) -> str:
-        return pattern.sub(lambda m: _ph(ttype, m.group(0)), t)
-    text = _replace(URL_RE,            "URL",     text)
-    text = _replace(MENTION_RE,        "MENTION", text)
-    text = _replace(HASHTAG_RE,        "HASHTAG", text)
-    text = _replace(DATE_RE,           "DATE",    text)
-    text = _replace(CURRENCY_RE,       "UNIT",    text)
-    text = _replace(NUMBER_RE,         "NUM",     text)
-    text = _replace(UNICODE_EMOJI_RE,  "EMOJI",   text)
-    text = _replace(TEXT_EMOJI_RE,     "EMOJI",   text)
-    return text, placeholders
-def restore_special_tokens(tokens: list[dict], placeholders: list[dict]) -> list[dict]:
-    """Restore placeholders in the token stream."""
-    if not placeholders:
         return tokens
-    ph_map   = {p["placeholder"]: p for p in placeholders}
-    restored: set[str] = set()
-    result: list[dict] = []
-    for tok in tokens:
-        raw = tok["token"]
-        matched = next(((ph, info) for ph, info in ph_map.items() if ph in raw), None)
-        if matched:
-            ph, info = matched
-            if ph not in restored:
-                restored.add(ph)
-                ttype = info["type"]
-                result.append({
-                    "token": f" {info['original']}",
-                    "type":  ttype,
-                    f"_{ttype.lower()}": True,
-                })
-        else:
-            result.append(tok)
-    return result
 def reclassify_numbers_in_tokens(tokens: list[dict]) -> list[dict]:
-    """Catch remaining number/unit tokens missed by pre-tokenization."""
     result: list[dict] = []
     for tok in tokens:
         if tok["type"] not in ("BPE", "ROOT"):

+"""Fix 8: Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI).
+Uses a segment-based approach: special tokens are detected and extracted
+*before* the base tokenizer runs, so they never pass through it.
+"""
 from __future__ import annotations
     "xi","xii","xiii","xiv","xv","xvi","xvii","xviii","xix","xx",
 }
+# ── Regex patterns ────────────────────────────────────────────────────────────
 URL_RE         = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
 MENTION_RE     = re.compile(r'@[\w\u00C0-\u024F]+')
 HASHTAG_RE     = re.compile(r'#[\w\u00C0-\u024F]+')
+# Turkish suffixes that can follow a number+apostrophe
+_NUM_SUFFIXES = sorted(
+    [
+        "nın","nin","nun","nün","dan","den","tan","ten",
+        "da","de","ta","te","ya","ye","nda","nde",
+        "yı","yi","yu","yü","nı","ni","nu","nü",
+        "lar","ler","lara","lere","ları","leri",
+        "ım","im","um","üm","ın","in","un","ün",
+        "mız","miz","muz","müz","nız","niz","nuz","nüz",
+        "dır","dir","dur","dür","tır","tir","tur","tür",
+        "ki","li","lı","lu","lü","sız","siz","suz","süz",
+        "inci","ıncı","uncu","üncü","nci","ncı",
+        "lık","lik","luk","lük",
+        "a","e","ı","i","u","ü",
+    ],
+    key=len,
+    reverse=True,
 )
+_SUFFIX_ALT = '|'.join(re.escape(s) for s in _NUM_SUFFIXES)
+# Number (or time) followed by apostrophe + Turkish suffix(es)
+NUM_APOSTROPHE_RE = re.compile(
+    r"\d+(?:[.:,]\d+)*['\u2019](?:" + _SUFFIX_ALT + r")+\b",
+    re.IGNORECASE,
+)
 DATE_RE        = re.compile(
     r'\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}'
     r'|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}'
 )
 CURRENCY_RE    = re.compile(r'[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]')
# Numbers that carry punctuation: percentages, thousands-grouped integers,
# decimals and simple fractions.  Alternatives are tried left to right, so the
# thousands form must come before the decimal form; the ``(?!\d)`` guard stops
# it from swallowing only the first three decimals of a longer fraction
# ("3.14159" must not match as "3.141" and leave "59" behind).
NUMBER_RE      = re.compile(
    r'%\d+[\.,]?\d*'
    r'|\d{1,3}(?:\.\d{3})+(?!\d)'  # thousands (1.000.000) — before decimal!
    r'|\d+[\.,]\d+'               # decimal (2.5, 10,5)
    r'|\d+%'
    r'|\d+/\d+'
)
+TIME_RE        = re.compile(r'\d{1,2}:\d{2}(?::\d{2})?')
+PLAIN_NUM_RE   = re.compile(r'\b\d+\b')
 TEXT_EMOJI_RE  = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
 UNICODE_EMOJI_RE = re.compile(
     "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
     flags=re.UNICODE,
 )
+# Pattern priority: the earliest-starting match wins, then the longest one; list order only breaks exact ties (same start and same length).
+_SPAN_PATTERNS: list[tuple[re.Pattern, str]] = [
+    (URL_RE,            "URL"),
+    (MENTION_RE,        "MENTION"),
+    (HASHTAG_RE,        "HASHTAG"),
+    (DATE_RE,           "DATE"),
+    (CURRENCY_RE,       "UNIT"),
+    (NUM_APOSTROPHE_RE, "NUM_APO"),
+    (NUMBER_RE,         "NUM"),
+    (TIME_RE,           "NUM"),
+    (PLAIN_NUM_RE,      "NUM"),
+    (UNICODE_EMOJI_RE,  "EMOJI"),
+    (TEXT_EMOJI_RE,     "EMOJI"),
+]
+# ── Segment-based API ────────────────────────────────────────────────────────
def find_special_spans(
    text: str,
    patterns: list[tuple[re.Pattern, str]] | None = None,
) -> list[tuple[int, int, str, str]]:
    """Find all special-token spans in *text*.

    Every pattern is run over the whole text and the matches are then
    reduced to a non-overlapping selection.  When matches compete, the one
    that starts first wins; among matches starting at the same position the
    longest wins; among matches with the identical span the pattern listed
    first wins.

    Args:
        text: Raw input text to scan.
        patterns: Optional ``(compiled_pattern, token_type)`` pairs in
            priority order.  Defaults to the module-level ``_SPAN_PATTERNS``.

    Returns:
        A list of ``(start, end, token_type, original_text)`` tuples sorted
        by ``start`` with no two spans overlapping.
    """
    if patterns is None:
        patterns = _SPAN_PATTERNS

    # (start, end, priority, token_type, matched_text)
    candidates: list[tuple[int, int, int, str, str]] = []
    for priority, (pattern, ttype) in enumerate(patterns):
        for m in pattern.finditer(text):
            # A zero-width match carries no text and would only produce an
            # empty special token downstream.
            if m.end() > m.start():
                candidates.append(
                    (m.start(), m.end(), priority, ttype, m.group(0))
                )

    # Earliest start first, then longest match, then explicit pattern priority
    # (previously this last tie-break relied on sort stability alone).
    candidates.sort(key=lambda c: (c[0], c[0] - c[1], c[2]))

    # Greedy left-to-right selection: keep a candidate only if it begins at
    # or after the end of the last accepted span.
    result: list[tuple[int, int, str, str]] = []
    last_end = 0
    for start, end, _priority, ttype, matched in candidates:
        if start >= last_end:
            result.append((start, end, ttype, matched))
            last_end = end
    return result
# Turkish pairs the capitals İ/I with i/ı, whereas ``str.lower()`` turns "İ"
# into "i" + U+0307 (combining dot) and "I" into "i".  Map both capitals
# explicitly before lowercasing so suffix lookup sees the intended letters.
_TR_UPPER_I_TO_LOWER = str.maketrans({"\u0130": "i", "I": "\u0131"})


def make_special_tokens(span_type: str, original: str) -> list[dict]:
    """Create the token dict(s) for one matched special span.

    Args:
        span_type: Label of the span (e.g. ``"URL"``, ``"NUM"``, ``"NUM_APO"``).
        original: The matched text.

    Returns:
        For ``NUM_APO`` spans: a ``NUM`` token for the part before the
        apostrophe, followed by one ``SUFFIX`` token per suffix found in
        ``_NUM_SUFFIXES`` (lowercased with Turkish İ/I handling).  If the
        text contains no apostrophe, a single ``NUM`` token with the whole
        text.  For every other span type: a one-element list whose token is
        the original text prefixed with a space and flagged with
        ``_<span_type lowercased>``.
    """
    if span_type != "NUM_APO":
        return [{
            "token": f" {original}",
            "type": span_type,
            f"_{span_type.lower()}": True,
        }]

    # Accept both the ASCII apostrophe and the typographic one (U+2019).
    apo_positions = [
        pos
        for pos in (original.find("'"), original.find("\u2019"))
        if pos != -1
    ]
    if not apo_positions:
        # Nothing to split on; slicing with -1 would mangle the text, so
        # keep it intact as a plain number.
        return [{"token": f" {original}", "type": "NUM", "_num": True}]
    apo_pos = min(apo_positions)

    tokens: list[dict] = [
        {"token": f" {original[:apo_pos]}", "type": "NUM", "_num": True}
    ]

    # Split the suffix string into individual Turkish suffixes.
    # _NUM_SUFFIXES is ordered longest first, so the first hit is the
    # longest suffix that fits at the current position.
    remaining = original[apo_pos + 1:].translate(_TR_UPPER_I_TO_LOWER).lower()
    while remaining:
        suffix = next(
            (s for s in _NUM_SUFFIXES if remaining.startswith(s)), None
        )
        if suffix is None:
            # Safety fallback — shouldn't happen if the regex matched
            tokens.append(
                {"token": remaining, "type": "SUFFIX", "_apo_suffix": True}
            )
            break
        tokens.append({"token": suffix, "type": "SUFFIX", "_apo_suffix": True})
        remaining = remaining[len(suffix):]
    return tokens
+# ── Safety-net post-pass ─────────────────────────────────────────────────────
 def reclassify_numbers_in_tokens(tokens: list[dict]) -> list[dict]:
+    """Catch remaining number/unit tokens missed by span detection."""
     result: list[dict] = []
     for tok in tokens:
         if tok["type"] not in ("BPE", "ROOT"):

turk_tokenizer/tokenizer.py CHANGED Viewed

@@ -37,8 +37,8 @@ from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
 from ._medical_vocab import ALL_DOMAIN_ROOTS
 from ._tdk_vocab import reclassify_foreign_words
 from ._normalizer import (
-    preprocess_special_tokens,
-    restore_special_tokens,
     reclassify_numbers_in_tokens,
 )
 from ._allomorph import add_canonical_labels
@@ -114,20 +114,35 @@ class TurkTokenizer:
         Returns a list of token dicts, each with:
             ``token``, ``token_type``, ``morph_pos``, and optional ``_*`` fields.
         """
-        # Fix 8 pre: replace URLs, mentions, numbers etc. with placeholders
-        text_norm, specials = preprocess_special_tokens(text)
-        # Fix 1 & 2 pre: ALL CAPS + apostrophe
-        processed, caps_map, apo_splits = preprocess(text_norm)
-        # Base tokenizer
-        raw = self._base.tokenize_text(processed)
-        # Fix 8 post: restore placeholders
-        tokens = restore_special_tokens(raw, specials)
-        # Fix 1 & 2 post
-        tokens = postprocess(tokens, caps_map, apo_splits)
         # Fix 3 + 5: BPE→SUFFIX reclassification + PUNCT
         tokens = reclassify_bpe_suffixes(tokens)

 from ._medical_vocab import ALL_DOMAIN_ROOTS
 from ._tdk_vocab import reclassify_foreign_words
 from ._normalizer import (
+    find_special_spans,
+    make_special_tokens,
     reclassify_numbers_in_tokens,
 )
 from ._allomorph import add_canonical_labels
         Returns a list of token dicts, each with:
             ``token``, ``token_type``, ``morph_pos``, and optional ``_*`` fields.
         """
+        # Fix 8: detect special tokens (NUM, DATE, URL, MENTION, HASHTAG, …)
+        # and split text into segments so they never enter the base tokenizer.
+        spans = find_special_spans(text)
+        tokens: list[dict] = []
+        pos = 0
+        for start, end, ttype, original in spans:
+            # Tokenize normal text before this special span
+            if pos < start:
+                segment = text[pos:start]
+                if segment.strip():
+                    seg_proc, caps, apo = preprocess(segment)
+                    seg_raw = self._base.tokenize_text(seg_proc)
+                    seg_tokens = postprocess(seg_raw, caps, apo)
+                    tokens.extend(seg_tokens)
+            # Insert the special token(s) directly
+            tokens.extend(make_special_tokens(ttype, original))
+            pos = end
+        # Tokenize remaining text after the last special span
+        if pos < len(text):
+            segment = text[pos:]
+            if segment.strip():
+                seg_proc, caps, apo = preprocess(segment)
+                seg_raw = self._base.tokenize_text(seg_proc)
+                seg_tokens = postprocess(seg_raw, caps, apo)
+                tokens.extend(seg_tokens)
         # Fix 3 + 5: BPE→SUFFIX reclassification + PUNCT
         tokens = reclassify_bpe_suffixes(tokens)