File size: 5,544 Bytes
ca41c16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""Fix 1: ALL CAPS inflation fix. Fix 2: Apostrophe / code-switching split."""

from __future__ import annotations

import re

# Turkish-specific letters (both cases).  Presence of any of these in a word
# is treated as proof the word is Turkish (see _is_turkish_base).
TR_CHARS = set("Γ§ΔŸΔ±ΕŸΓΆΓΌΓ‡ΔžΔ°ΕžΓ–Γœ")

# Lowercase Turkish proper nouns / month names that contain no
# Turkish-specific character, so they need an explicit allow-list.
KNOWN_TURKISH_BASES = {
    "istanbul", "ankara", "izmir", "türkiye", "anadolu", "boğaziçi",
    "cumhuriyet", "atatΓΌrk", "karadeniz", "marmara", "ege", "akdeniz",
    "temmuz", "ocak", "şubat", "mart", "nisan", "mayıs", "haziran",
    "ağustos", "eylül", "ekim", "kasım", "aralık",
}

# Lowercase foreign (code-switched) bases that commonly appear in Turkish
# text with an apostrophe + Turkish suffix (e.g. "Zoom'da", "Python'u").
KNOWN_FOREIGN_BASES = {
    "python", "zoom", "google", "github", "twitter", "youtube",
    "instagram", "linkedin", "facebook", "whatsapp", "telegram",
    "numpy", "pandas", "django", "flask", "react", "javascript",
    "typescript", "docker", "linux", "windows", "android", "iphone",
    "chatgpt", "openai", "claude", "gemini", "llama", "bert",
    "excel", "powerpoint", "outlook", "teams", "slack", "notion",
    "spotify", "netflix", "amazon", "alibaba", "huawei", "samsung",
}

# Turkish suffixes that may follow an apostrophe, longest-first so that a
# longest-match scan can try them in order.
TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
    [
        "nΔ±n","nin","nun","nΓΌn","dan","den","tan","ten",
        "da","de","ta","te","ya","ye","nda","nde",
        "yΔ±","yi","yu","yΓΌ","nΔ±","ni","nu","nΓΌ",
        "lar","ler","lara","lere","larΔ±","leri",
        "Δ±m","im","um","ΓΌm","Δ±n","in","un","ΓΌn",
        "mΔ±z","miz","muz","mΓΌz","nΔ±z","niz","nuz","nΓΌz",
        "dΔ±r","dir","dur","dΓΌr","tΔ±r","tir","tur","tΓΌr",
        "ki","li","lΔ±","lu","lΓΌ","sΔ±z","siz","suz","sΓΌz",
        "a","e","Δ±","i","u","ΓΌ",
    ],
    key=len,
    reverse=True,
)

# Private-use sentinel (U+E001) inserted between base and suffix; it survives
# tokenization so postprocess can find the split point again.
_APO_SEP   = "\ue001"
# base (2+ alphanumerics) + ASCII or typographic apostrophe + short suffix.
_APO_RE    = re.compile(
    r"([A-Za-zΓ‡Γ§ΔžΔŸΔ°Δ±Γ–ΓΆΕžΕŸΓœΓΌ0-9]{2,})['\u2019]([A-Za-zΓ‡Γ§ΔžΔŸΔ°Δ±Γ–ΓΆΕžΕŸΓœΓΌ]{1,6})\b"
)
# A run of 2+ uppercase (Latin or Turkish) letters, i.e. an ALL-CAPS word.
_CAPS_RE   = re.compile(r'\b([A-ZΓ‡ΔžΔ°Γ–ΕžΓœ]{2,})\b')


def _is_turkish_base(word: str) -> bool:
    """Decide whether *word* is a Turkish base (True) or a foreign one (False).

    Order matters: an explicit foreign-list hit wins; then any
    Turkish-specific character, the Turkish allow-list, or being shorter
    than four characters counts as Turkish.
    """
    lowered = word.lower()
    if lowered in KNOWN_FOREIGN_BASES:
        return False
    return (
        not TR_CHARS.isdisjoint(word)
        or lowered in KNOWN_TURKISH_BASES
        or len(lowered) < 4
    )


# ── Fix 1: ALL CAPS ───────────────────────────────────────────────────────────

def _fix_all_caps(text: str) -> tuple[str, set]:
    """Lowercase every ALL-CAPS run in *text*.

    Returns the rewritten text together with the set of lowercased words,
    so the caller can restore the uppercase marking after tokenization.
    """
    recorded: set[str] = set()

    def _lower(match: re.Match) -> str:
        lowered = match.group(1).lower()
        recorded.add(lowered)
        return lowered

    return _CAPS_RE.sub(_lower, text), recorded


def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
    result: list[dict] = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        raw_low = tok["token"].strip().lower()

        if tok["type"] == "ROOT" and raw_low in caps:
            result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
            result.append(tok)
            i += 1
            continue

        if tok["type"] == "BPE" and tok["token"].startswith(" "):
            combined  = raw_low
            lookahead = [tok]
            j = i + 1
            while j < len(tokens):
                nt = tokens[j]
                if not nt["token"].startswith(" "):
                    combined += nt["token"].strip().lower()
                    lookahead.append(nt)
                    j += 1
                    if combined in caps:
                        break
                    if len(combined) > 8:
                        break
                else:
                    break
            if combined in caps:
                result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
                result.append({"token": f" {combined}", "type": "ROOT",
                                "_acronym": True, "_caps": True})
                i = j
                continue

        result.append(tok)
        i += 1

    return result


# ── Fix 2: Apostrophe split ───────────────────────────────────────────────────

def _split_apostrophe(text: str) -> str:
    """Split foreign-base + apostrophe + Turkish-suffix words in *text*.

    For matches like ``Zoom'da`` where the base is NOT judged Turkish and
    the part after the apostrophe is a known Turkish suffix, the apostrophe
    is replaced by the ``_APO_SEP`` sentinel (space-padded) so the
    tokenizer sees base and suffix as separate words.  All other matches
    are left untouched.
    """
    def _repl(m: re.Match) -> str:
        base, suffix = m.group(1), m.group(2)
        if _is_turkish_base(base):
            return m.group(0)  # Turkish bases keep their apostrophe intact
        # Plain membership test; the hand-rolled any(suffix == s for s in …)
        # scan was equivalent but less idiomatic.  Lowercase once, up front.
        if suffix.lower() in TURKISH_SUFFIXES_AFTER_APOSTROPHE:
            return f"{base} {_APO_SEP} {suffix}"
        return m.group(0)

    return _APO_RE.sub(_repl, text)


def _merge_apostrophe_tokens(tokens: list[dict]) -> list[dict]:
    """Drop ``_APO_SEP`` sentinel tokens and retag their neighbours.

    The token before the sentinel (if any) becomes a ROOT tagged
    ``_foreign``; the token after it (if any) becomes a SUFFIX tagged
    ``_apo_suffix``.  Note the neighbour dicts are retagged in place.
    """
    merged: list[dict] = []
    idx = 0
    total = len(tokens)
    while idx < total:
        current = tokens[idx]
        if _APO_SEP not in current["token"].strip():
            merged.append(current)
            idx += 1
            continue
        # Sentinel token: retag neighbours, drop the sentinel itself.
        if merged:
            merged[-1]["type"] = "ROOT"
            merged[-1]["_foreign"] = True
        idx += 1
        if idx < total:
            suffix_tok = tokens[idx]
            suffix_tok["type"] = "SUFFIX"
            suffix_tok["_apo_suffix"] = True
            merged.append(suffix_tok)
            idx += 1
    return merged


# ── Combined pre / post ───────────────────────────────────────────────────────

def preprocess(text: str) -> tuple[str, set]:
    """Apply both text fixes: ALL-CAPS folding, then the apostrophe split.

    Returns the rewritten text and the set of folded caps words, which
    must be passed back to :func:`postprocess`.
    """
    folded, caps_words = _fix_all_caps(text)
    return _split_apostrophe(folded), caps_words


def postprocess(tokens: list[dict], caps: set) -> list[dict]:
    """Undo the preprocessing on the token stream: restore caps markers,
    then merge the apostrophe sentinel tokens."""
    return _merge_apostrophe_tokens(_restore_caps_tokens(tokens, caps))