"""Fix 1: ALL CAPS inflation fix. Fix 2: Apostrophe / code-switching split."""
from __future__ import annotations
import re
from pathlib import Path
TR_CHARS = set("Γ§ΔΔ±ΕΓΆΓΌΓΔΔ°ΕΓΓ")
_PROPER_NOUNS: set[str] | None = None
def _load_proper_nouns() -> set[str]:
global _PROPER_NOUNS
if _PROPER_NOUNS is not None:
return _PROPER_NOUNS
path = Path(__file__).parent / "data" / "turkish_proper_nouns.txt"
if path.exists():
_PROPER_NOUNS = {
line.strip().lower()
for line in path.read_text(encoding="utf-8").splitlines()
if line.strip() and not line.startswith("#")
}
else:
_PROPER_NOUNS = set()
return _PROPER_NOUNS
def _turkish_lower(s: str) -> str:
"""Turkish-aware lowercase: Δ°βi, IβΔ± (not i), then standard lower."""
return s.replace("Δ°", "i").replace("I", "Δ±").lower()
TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
[
"nΔ±n","nin","nun","nΓΌn","dan","den","tan","ten",
"da","de","ta","te","ya","ye","nda","nde",
"yΔ±","yi","yu","yΓΌ","nΔ±","ni","nu","nΓΌ",
"lar","ler","lara","lere","larΔ±","leri",
"Δ±m","im","um","ΓΌm","Δ±n","in","un","ΓΌn",
"mΔ±z","miz","muz","mΓΌz","nΔ±z","niz","nuz","nΓΌz",
"dΔ±r","dir","dur","dΓΌr","tΔ±r","tir","tur","tΓΌr",
"ki","li","lΔ±","lu","lΓΌ","sΔ±z","siz","suz","sΓΌz",
"a","e","Δ±","i","u","ΓΌ",
],
key=len,
reverse=True,
)
_APO_RE = re.compile(
r"([A-Za-zΓΓ§ΔΔΔ°Δ±ΓΓΆΕΕΓΓΌ0-9]{2,})['\u2019]([A-Za-zΓΓ§ΔΔΔ°Δ±ΓΓΆΕΕΓΓΌ]{1,6})\b"
)
_CAPS_RE = re.compile(r'\b([A-ZΓΔΔ°ΓΕΓ]{2,})\b')
def _is_turkish_base(word: str) -> bool:
    """Return True if the word should be treated as Turkish (don't split apostrophe).

    Checks, in order: Turkish-specific characters, the bundled proper-noun
    list, the TDK dictionary, and Zemberek lemma analysis.  When none of the
    dictionaries can answer, words shorter than 4 characters are treated as
    Turkish because they are too ambiguous to classify as foreign.
    """
    wl = _turkish_lower(word)
    # Fast path: Turkish-specific characters → definitely Turkish
    if any(c in TR_CHARS for c in wl):
        return True
    # Turkish proper nouns (cities, regions) — not in TDK common-word list
    if wl in _load_proper_nouns():
        return True
    # TDK lookup: if it's in the dictionary it's Turkish (or an accepted loanword).
    # Guard the import so a missing vocab module degrades to the fallback at the
    # bottom instead of raising (the original import was unguarded, contradicting
    # the "TDK unavailable" fallback comment).
    try:
        from ._tdk_vocab import load_tdk_words  # noqa: PLC0415
    except ImportError:
        tdk = None
    else:
        tdk = load_tdk_words()
    if tdk and wl in tdk:
        return True
    # Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir…)
    try:
        from ._root_validator import _morphology, ZEMBEREK_AVAILABLE  # noqa: PLC0415
        if ZEMBEREK_AVAILABLE and _morphology:
            for analysis in _morphology.analyze(wl):
                # Lemma is the text between the leading '[' and first ']'
                # of Zemberek's analysis string.
                lemma = str(analysis).split("]")[0].lstrip("[")
                if any(c in TR_CHARS for c in lemma):
                    return True
    except Exception:  # noqa: BLE001
        pass
    # TDK unavailable + Zemberek unavailable: very short words are ambiguous
    return len(wl) < 4
# ββ Fix 1: ALL CAPS βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def _fix_all_caps(text: str) -> tuple[str, set]:
    """Lowercase every ALL-CAPS word in *text* (Fix 1).

    Each matched word is lowered with Turkish casing rules and recorded, so
    a later pass (:func:`_restore_caps_tokens`) can re-mark those words with
    an ``<uppercase_word>`` token.  Returns ``(rewritten_text, lowered_words)``.
    """
    lowered_words: set[str] = set()

    def _lower_match(match: re.Match) -> str:
        lowered = _turkish_lower(match.group(1))
        lowered_words.add(lowered)
        return lowered

    rewritten = _CAPS_RE.sub(_lower_match, text)
    return rewritten, lowered_words
def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
    """Re-insert ``<uppercase_word>`` markers for words recorded by _fix_all_caps.

    Two cases are handled:
      * a ROOT token whose lowered, stripped text is in *caps* — a marker
        token is inserted before it;
      * a word-initial BPE piece (leading space) whose following no-space
        pieces concatenate into a word in *caps* — the pieces are collapsed
        into a single ROOT token and a marker is inserted before it.
    Tokens are dicts with at least "token" and "type" keys.
    """
    result: list[dict] = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        raw_low = _turkish_lower(tok["token"].strip())
        # Case 1: whole-word ROOT token that was originally ALL CAPS.
        if tok["type"] == "ROOT" and raw_low in caps:
            result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
            result.append(tok)
            i += 1
            continue
        # Case 2: word-initial BPE piece — try to assemble the full word from
        # the following continuation pieces (those without a leading space).
        if tok["type"] == "BPE" and tok["token"].startswith(" "):
            combined = raw_low
            lookahead = [tok]  # collected continuation pieces (not otherwise read)
            j = i + 1
            while j < len(tokens):
                nt = tokens[j]
                if not nt["token"].startswith(" "):
                    combined += _turkish_lower(nt["token"].strip())
                    lookahead.append(nt)
                    j += 1
                    if combined in caps:
                        break
                    if len(combined) > 8:
                        # Bail out once the assembled word exceeds 8 chars —
                        # presumably caps words are expected to be short.
                        break
                else:
                    break  # next word started; stop assembling
            if combined in caps:
                result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
                result.append({"token": f" {combined}", "type": "ROOT",
                               "_acronym": True, "_caps": True})
                i = j  # skip the pieces we just merged
                continue
        # Default: pass the token through unchanged.
        result.append(tok)
        i += 1
    return result
# ββ Fix 2: Apostrophe split βββββββββββββββββββββββββββββββββββββββββββββββββββ
#
# Strategy: record (foreign_base, suffix) pairs, replace apostrophe with space.
# After tokenization, _merge_apostrophe_tokens uses these pairs to find the
# BPE pieces that form the foreign word and merge them into one FOREIGN ROOT,
# then marks the following word-initial suffix token as SUFFIX.
#
# Old approach used a \ue001 separator β the base tokenizer converts that to
# '<unknown>' so the separator was never found. Simple-space + pair-list is
# robust regardless of how the tokenizer handles the input.
def _split_apostrophe(text: str) -> tuple[str, list[tuple[str, str]]]:
    """
    Replace FOREIGN'SUFFIX with 'FOREIGN SUFFIX' (apostrophe → space).
    Returns (modified_text, [(foreign_base_lower, suffix_lower), ...]).
    Turkish proper names (İstanbul'da) are left unchanged.
    """
    splits: list[tuple[str, str]] = []

    def _repl(m: re.Match) -> str:
        base, suffix = m.group(1), m.group(2)
        if _is_turkish_base(base):
            return m.group(0)  # leave Turkish names alone
        # Turkish-aware lowering: plain str.lower() maps "NIN" to "nin"
        # (not "nın"), so the recorded suffix would neither match the suffix
        # list correctly nor the _turkish_lower'ed tokens that
        # _merge_apostrophe_tokens compares against.
        sl = _turkish_lower(suffix)
        if sl in TURKISH_SUFFIXES_AFTER_APOSTROPHE:
            splits.append((_turkish_lower(base), sl))
            return f"{base} {suffix}"  # just drop the apostrophe
        return m.group(0)

    return _APO_RE.sub(_repl, text), splits
def _merge_apostrophe_tokens(
    tokens: list[dict], apo_splits: list[tuple[str, str]]
) -> list[dict]:
    """
    For each (foreign_base, suffix) pair recorded during _split_apostrophe,
    find the consecutive BPE/ROOT pieces that together spell foreign_base,
    merge them into one FOREIGN ROOT token, and mark the next word-initial
    token whose stripped form == suffix as SUFFIX.
    """
    if not apo_splits:
        return tokens
    result = list(tokens)  # work on a copy; original token list is not mutated
    for foreign_base, suffix in apo_splits:
        # NOTE: n is captured before scanning; result is only rebuilt after a
        # successful merge, which is immediately followed by `break`.
        n = len(result)
        for j in range(1, n):
            tok_j = result[j]
            # Candidate suffix token: word-initial, stripped == suffix
            if not tok_j["token"].startswith(" "):
                continue
            if _turkish_lower(tok_j["token"].strip()) != suffix:
                continue
            # Walk back to find pieces of the word before j (no leading space)
            word_start = j - 1
            while word_start > 0 and not result[word_start]["token"].startswith(" "):
                word_start -= 1
            pieces = result[word_start:j]
            if not pieces:
                continue
            # The pieces must spell exactly the recorded foreign base.
            combined = "".join(_turkish_lower(p["token"].strip()) for p in pieces)
            if combined != foreign_base:
                continue
            # Merge pieces into one FOREIGN ROOT
            merged = pieces[0]["token"]  # keeps leading space
            for p in pieces[1:]:
                merged += p["token"].strip()
            new_root = {"token": merged, "type": "ROOT", "_foreign": True}
            # Copy the suffix token, retagging it as SUFFIX.
            new_suf = {**tok_j, "type": "SUFFIX", "_apo_suffix": True}
            result = (
                result[:word_start]
                + [new_root, new_suf]
                + result[j + 1:]
            )
            break  # this pair is handled
    return result
# ββ Combined pre / post βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def preprocess(text: str) -> tuple[str, set, list]:
    """Prepare text before base tokenization.

    Applies Fix 1 (ALL-CAPS lowering) and then Fix 2 (apostrophe split).

    Returns:
        (modified_text, caps_set, apo_splits)
    """
    caps_fixed, caps = _fix_all_caps(text)
    final_text, apo_splits = _split_apostrophe(caps_fixed)
    return final_text, caps, apo_splits
def postprocess(
    tokens: list[dict], caps: set, apo_splits: list | None = None
) -> list[dict]:
    """Fix tokens after base tokenization.

    Restores ``<uppercase_word>`` markers (Fix 1), then merges foreign
    base + suffix pieces recorded during preprocessing (Fix 2).
    """
    restored = _restore_caps_tokens(tokens, caps)
    pairs = apo_splits if apo_splits else []
    return _merge_apostrophe_tokens(restored, pairs)
|