nmstech committed on
Commit
fcd513a
Β·
verified Β·
1 Parent(s): 58e6961

Add smart ACRONYM detection: TDK-based disambiguation for uppercase tokens

Browse files
turk_tokenizer/_acronym_dict.py CHANGED
@@ -83,13 +83,27 @@ ACRONYM_EXPANSIONS: dict[str, str] = {
83
 
84
 
85
  def reclassify_acronyms(tokens: list[dict]) -> list[dict]:
86
- """Add ``_expansion`` field to known acronyms in the token stream."""
87
  result: list[dict] = []
88
  for tok in tokens:
89
- if tok["type"] == "ROOT" and (tok.get("_acronym") or tok.get("_caps")):
90
- expansion = ACRONYM_EXPANSIONS.get(tok["token"].strip().upper())
 
 
 
91
  if expansion:
92
  result.append({**tok, "_expansion": expansion, "_known_acronym": True})
93
- continue
94
- result.append(tok)
 
 
 
 
 
 
 
 
 
 
 
95
  return result
 
83
 
84
 
85
def reclassify_acronyms(tokens: list[dict]) -> list[dict]:
    """Add ``_expansion`` to known acronyms; promote CAPS ROOTs to ACRONYM."""
    out: list[dict] = []
    for tok in tokens:
        expansion = ACRONYM_EXPANSIONS.get(tok["token"].strip().upper())
        ttype = tok["type"]
        caps_root = ttype == "ROOT" and (tok.get("_acronym") or tok.get("_caps"))

        if expansion and ttype == "ACRONYM":
            # Already typed as ACRONYM by span detection — just attach the expansion.
            out.append({**tok, "_expansion": expansion, "_known_acronym": True})
        elif expansion and caps_root:
            # ALL-CAPS ROOT found in the acronym dict → promote to ACRONYM.
            out.append({
                **tok,
                "type": "ACRONYM",
                "_expansion": expansion,
                "_known_acronym": True,
            })
        else:
            # Unknown token, or no dictionary expansion: pass through unchanged.
            out.append(tok)
    return out
turk_tokenizer/_normalizer.py CHANGED
@@ -78,6 +78,22 @@ NUMBER_RE = re.compile(
78
  TIME_RE = re.compile(r'\d{1,2}:\d{2}(?::\d{2})?')
79
  PLAIN_NUM_RE = re.compile(r'\b\d+\b')
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  TEXT_EMOJI_RE = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
82
  UNICODE_EMOJI_RE = re.compile(
83
  "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
@@ -89,20 +105,60 @@ UNICODE_EMOJI_RE = re.compile(
89
 
90
  # Pattern priority: earlier entries win when spans overlap.
91
  _SPAN_PATTERNS: list[tuple[re.Pattern, str]] = [
92
- (URL_RE, "URL"),
93
- (MENTION_RE, "MENTION"),
94
- (HASHTAG_RE, "HASHTAG"),
95
- (DATE_RE, "DATE"),
96
- (CURRENCY_RE, "UNIT"),
97
- (NUM_APOSTROPHE_RE, "NUM_APO"),
98
- (NUMBER_RE, "NUM"),
99
- (TIME_RE, "NUM"),
100
- (PLAIN_NUM_RE, "NUM"),
101
- (UNICODE_EMOJI_RE, "EMOJI"),
102
- (TEXT_EMOJI_RE, "EMOJI"),
 
 
103
  ]
104
 
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  # ── Segment-based API ────────────────────────────────────────────────────────
107
 
108
  def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
@@ -114,7 +170,22 @@ def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
114
  candidates: list[tuple[int, int, str, str]] = []
115
  for pattern, ttype in _SPAN_PATTERNS:
116
  for m in pattern.finditer(text):
117
- candidates.append((m.start(), m.end(), ttype, m.group(0)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  # Sort by start position, then prefer longer match
120
  candidates.sort(key=lambda x: (x[0], -(x[1] - x[0])))
@@ -129,36 +200,56 @@ def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
129
  return result
130
 
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  def make_special_tokens(span_type: str, original: str) -> list[dict]:
133
  """Create token dict(s) for a matched special span.
134
 
135
- ``NUM_APO`` spans are split into a NUM token + SUFFIX token(s).
136
  """
 
137
  if span_type == "NUM_APO":
138
  apo_pos = original.find("'")
139
  if apo_pos == -1:
140
  apo_pos = original.find("\u2019")
141
  num_part = original[:apo_pos]
142
- suffix_str = original[apo_pos + 1:]
143
-
144
- tokens: list[dict] = [{"token": f" {num_part}", "type": "NUM", "_num": True}]
145
-
146
- # Split suffix_str into individual Turkish suffixes
147
- remaining = suffix_str.lower()
148
- while remaining:
149
- matched = False
150
- for s in _NUM_SUFFIXES:
151
- if remaining.startswith(s):
152
- tokens.append({"token": s, "type": "SUFFIX", "_apo_suffix": True})
153
- remaining = remaining[len(s):]
154
- matched = True
155
- break
156
- if not matched:
157
- # Safety fallback β€” shouldn't happen if the regex matched
158
- tokens.append({"token": remaining, "type": "SUFFIX", "_apo_suffix": True})
159
- break
160
- return tokens
161
 
 
162
  return [{
163
  "token": f" {original}",
164
  "type": span_type,
 
78
  TIME_RE = re.compile(r'\d{1,2}:\d{2}(?::\d{2})?')
79
  PLAIN_NUM_RE = re.compile(r'\b\d+\b')
80
 
81
# ── Acronym patterns ─────────────────────────────────────────────────────────
# Matches standalone uppercase sequences (+ optional trailing digits).
#   [A-Z]{2,}[0-9]*  →  HTML, GPT, CSS3, HTML5, MP3
#   [A-Z][0-9]+      →  F16, H264, A4
# The character class also covers the Turkish uppercase letters ÇĞİÖŞÜ.
# Does NOT match mixed-case words (ChatGPT) because \b won't fire mid-word.
ACRONYM_RE = re.compile(
    r"\b[A-ZÇĞİÖŞÜ]{2,}[0-9]*\b"
    r"|\b[A-ZÇĞİÖŞÜ][0-9]+\b"
)

# Acronym followed by apostrophe + Turkish suffix(es): NATO'nun, HTML5'ten.
# Accepts both ASCII ' and the right single quote \u2019 before the suffix run.
# NOTE(review): relies on the module-level _SUFFIX_ALT alternation — confirm it
# is anchored/ordered so longer suffixes win inside the (?:...)+ group.
ACRONYM_APOSTROPHE_RE = re.compile(
    r"\b(?:[A-ZÇĞİÖŞÜ]{2,}[0-9]*|[A-ZÇĞİÖŞÜ][0-9]+)['\u2019](?:"
    + _SUFFIX_ALT + r")+\b"
)
+ )
96
+
97
  TEXT_EMOJI_RE = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
98
  UNICODE_EMOJI_RE = re.compile(
99
  "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
 
105
 
106
# Pattern priority: earlier entries win when spans overlap.
_SPAN_PATTERNS: list[tuple[re.Pattern, str]] = [
    (URL_RE, "URL"),
    (MENTION_RE, "MENTION"),
    (HASHTAG_RE, "HASHTAG"),
    (DATE_RE, "DATE"),
    (CURRENCY_RE, "UNIT"),
    (NUM_APOSTROPHE_RE, "NUM_APO"),
    # Acronym patterns are listed before the numeric ones; ordering only
    # matters when matched spans overlap (see comment above).
    (ACRONYM_APOSTROPHE_RE, "ACRONYM_APO"),
    (ACRONYM_RE, "ACRONYM"),
    (NUMBER_RE, "NUM"),
    (TIME_RE, "NUM"),
    (PLAIN_NUM_RE, "NUM"),
    (UNICODE_EMOJI_RE, "EMOJI"),
    (TEXT_EMOJI_RE, "EMOJI"),
]
122
 
123
 
124
+ # ── Acronym vs Turkish word disambiguation ───────────────────────────────────
125
+
126
def _is_known_turkish_word(word_upper: str) -> bool:
    """Return True if *word_upper* (ALL CAPS) is a known Turkish word.

    Checks (in order):
    1. ACRONYM_EXPANSIONS dict → always acronym (return False)
    2. Same dict without trailing digits (HTML5 → HTML)
    3. TDK dictionary → Turkish word (return True)
    4. Proper nouns list → Turkish word (return True)
    5. Otherwise → treat as acronym (return False)
    """
    # Imports are local to avoid circular-import problems at module load.
    from ._acronym_dict import ACRONYM_EXPANSIONS  # noqa: PLC0415
    from ._preprocessor import _turkish_lower, _load_proper_nouns  # noqa: PLC0415
    from ._tdk_vocab import load_tdk_words  # noqa: PLC0415

    # Known acronyms always win, digits included or stripped (HTML5 → HTML).
    if word_upper in ACRONYM_EXPANSIONS:
        return False
    stem = word_upper.rstrip("0123456789")
    if stem and stem != word_upper and stem in ACRONYM_EXPANSIONS:
        return False

    lowered = _turkish_lower(word_upper)

    # TDK dictionary: the lowercase form being a real Turkish word wins next.
    vocab = load_tdk_words()
    if vocab and lowered in vocab:
        return True

    # Finally the proper-noun list (İstanbul, Ankara…); otherwise: acronym.
    return lowered in _load_proper_nouns()
160
+
161
+
162
  # ── Segment-based API ────────────────────────────────────────────────────────
163
 
164
  def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
 
170
  candidates: list[tuple[int, int, str, str]] = []
171
  for pattern, ttype in _SPAN_PATTERNS:
172
  for m in pattern.finditer(text):
173
+ original = m.group(0)
174
+
175
+ # Acronym filtering: skip if it's actually a Turkish word
176
+ if ttype in ("ACRONYM", "ACRONYM_APO"):
177
+ # Extract the uppercase base (before apostrophe for APO)
178
+ if ttype == "ACRONYM_APO":
179
+ apo = original.find("'")
180
+ if apo == -1:
181
+ apo = original.find("\u2019")
182
+ acr_base = original[:apo]
183
+ else:
184
+ acr_base = original
185
+ if _is_known_turkish_word(acr_base):
186
+ continue
187
+
188
+ candidates.append((m.start(), m.end(), ttype, original))
189
 
190
  # Sort by start position, then prefer longer match
191
  candidates.sort(key=lambda x: (x[0], -(x[1] - x[0])))
 
200
  return result
201
 
202
 
203
def _split_apostrophe_suffixes(suffix_str: str) -> list[dict]:
    """Split a suffix string (after apostrophe) into individual SUFFIX tokens."""
    pieces: list[dict] = []
    rest = suffix_str.lower()
    while rest:
        # First suffix in _NUM_SUFFIXES order that prefixes the remainder.
        hit = next((s for s in _NUM_SUFFIXES if rest.startswith(s)), None)
        if hit is None:
            # Safety fallback — shouldn't happen if the span regex matched.
            pieces.append({"token": rest, "type": "SUFFIX", "_apo_suffix": True})
            break
        pieces.append({"token": hit, "type": "SUFFIX", "_apo_suffix": True})
        rest = rest[len(hit):]
    return pieces
219
+
220
+
221
  def make_special_tokens(span_type: str, original: str) -> list[dict]:
222
  """Create token dict(s) for a matched special span.
223
 
224
+ ``NUM_APO`` and ``ACRONYM_APO`` spans are split into base + SUFFIX tokens.
225
  """
226
+ # ── Number + apostrophe + suffix (3'te, 1990'larda) ──────────────────
227
  if span_type == "NUM_APO":
228
  apo_pos = original.find("'")
229
  if apo_pos == -1:
230
  apo_pos = original.find("\u2019")
231
  num_part = original[:apo_pos]
232
+ return [
233
+ {"token": f" {num_part}", "type": "NUM", "_num": True},
234
+ *_split_apostrophe_suffixes(original[apo_pos + 1:]),
235
+ ]
236
+
237
+ # ── Acronym + apostrophe + suffix (NATO'nun, HTML5'ten) ──────────────
238
+ if span_type == "ACRONYM_APO":
239
+ apo_pos = original.find("'")
240
+ if apo_pos == -1:
241
+ apo_pos = original.find("\u2019")
242
+ acr_part = original[:apo_pos]
243
+ return [
244
+ {"token": f" {acr_part}", "type": "ACRONYM", "_acronym": True},
245
+ *_split_apostrophe_suffixes(original[apo_pos + 1:]),
246
+ ]
247
+
248
+ # ── Plain acronym (HTML5, GPT) ──────────────────────────────────────
249
+ if span_type == "ACRONYM":
250
+ return [{"token": f" {original}", "type": "ACRONYM", "_acronym": True}]
251
 
252
+ # ── Everything else (NUM, DATE, URL, MENTION, HASHTAG, EMOJI, UNIT) ──
253
  return [{
254
  "token": f" {original}",
255
  "type": span_type,
turk_tokenizer/tokenizer.py CHANGED
@@ -56,13 +56,13 @@ _DOMAIN_ROOTS_LOWER = {k.lower() for k in ALL_DOMAIN_ROOTS}
56
  # ── Token types ───────────────────────────────────────────────────────────────
57
 
58
  _SPECIAL_TYPES = frozenset(
59
- ("NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI")
60
  )
61
 
62
  _TYPE_SYM = {
63
  "ROOT": "R", "SUFFIX": "S", "FOREIGN": "F", "BPE": "B", "PUNCT": "P",
64
  "NUM": "N", "DATE": "D", "UNIT": "U",
65
- "URL": "L", "MENTION": "@", "HASHTAG": "#", "EMOJI": "E",
66
  }
67
 
68
 
 
56
# ── Token types ───────────────────────────────────────────────────────────────

# Token types produced by special-span detection in the normalizer
# (NOTE(review): presumably these skip morphological splitting — confirm
# against the tokenize path that consults this set).
_SPECIAL_TYPES = frozenset(
    ("NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI", "ACRONYM")
)

# Single-character symbol per token type; ACRONYM renders as "A".
_TYPE_SYM = {
    "ROOT": "R", "SUFFIX": "S", "FOREIGN": "F", "BPE": "B", "PUNCT": "P",
    "NUM": "N", "DATE": "D", "UNIT": "U",
    "URL": "L", "MENTION": "@", "HASHTAG": "#", "EMOJI": "E", "ACRONYM": "A",
}
67
 
68