File size: 5,544 Bytes
ca41c16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""Fix 1: ALL CAPS inflation fix. Fix 2: Apostrophe / code-switching split."""

from __future__ import annotations

import re

# Turkish-specific letters (both cases).  Presence of any of these in a word
# is treated as proof the word is Turkish (see _is_turkish_base).
TR_CHARS = set("Γ§ΔŸΔ±ΕŸΓΆΓΌΓ‡ΔžΔ°ΕžΓ–Γœ")

# Lowercase Turkish proper nouns / month names that contain no
# Turkish-specific character, so they need an explicit allow-list.
KNOWN_TURKISH_BASES = {
    "istanbul", "ankara", "izmir", "türkiye", "anadolu", "boğaziçi",
    "cumhuriyet", "atatΓΌrk", "karadeniz", "marmara", "ege", "akdeniz",
    "temmuz", "ocak", "şubat", "mart", "nisan", "mayıs", "haziran",
    "ağustos", "eylül", "ekim", "kasım", "aralık",
}

# Lowercase foreign (code-switched) bases that commonly appear in Turkish
# text with an apostrophe + Turkish suffix (e.g. "Zoom'da", "Python'u").
KNOWN_FOREIGN_BASES = {
    "python", "zoom", "google", "github", "twitter", "youtube",
    "instagram", "linkedin", "facebook", "whatsapp", "telegram",
    "numpy", "pandas", "django", "flask", "react", "javascript",
    "typescript", "docker", "linux", "windows", "android", "iphone",
    "chatgpt", "openai", "claude", "gemini", "llama", "bert",
    "excel", "powerpoint", "outlook", "teams", "slack", "notion",
    "spotify", "netflix", "amazon", "alibaba", "huawei", "samsung",
}

# Turkish suffixes that may follow an apostrophe, longest-first so that a
# longest-match scan can try them in order.
TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
    [
        "nΔ±n","nin","nun","nΓΌn","dan","den","tan","ten",
        "da","de","ta","te","ya","ye","nda","nde",
        "yΔ±","yi","yu","yΓΌ","nΔ±","ni","nu","nΓΌ",
        "lar","ler","lara","lere","larΔ±","leri",
        "Δ±m","im","um","ΓΌm","Δ±n","in","un","ΓΌn",
        "mΔ±z","miz","muz","mΓΌz","nΔ±z","niz","nuz","nΓΌz",
        "dΔ±r","dir","dur","dΓΌr","tΔ±r","tir","tur","tΓΌr",
        "ki","li","lΔ±","lu","lΓΌ","sΔ±z","siz","suz","sΓΌz",
        "a","e","Δ±","i","u","ΓΌ",
    ],
    key=len,
    reverse=True,
)

# Private-use sentinel (U+E001) inserted between base and suffix; it survives
# tokenization so postprocess can find the split point again.
_APO_SEP   = "\ue001"
# base (2+ alphanumerics) + ASCII or typographic apostrophe + short suffix.
_APO_RE    = re.compile(
    r"([A-Za-zΓ‡Γ§ΔžΔŸΔ°Δ±Γ–ΓΆΕžΕŸΓœΓΌ0-9]{2,})['\u2019]([A-Za-zΓ‡Γ§ΔžΔŸΔ°Δ±Γ–ΓΆΕžΕŸΓœΓΌ]{1,6})\b"
)
# A run of 2+ uppercase (Latin or Turkish) letters, i.e. an ALL-CAPS word.
_CAPS_RE   = re.compile(r'\b([A-ZΓ‡ΔžΔ°Γ–ΕžΓœ]{2,})\b')


def _is_turkish_base(word: str) -> bool:
    """Decide whether *word* is a Turkish base (True) or a foreign one (False).

    Order matters: an explicit foreign-list hit wins; then any
    Turkish-specific character, the Turkish allow-list, or being shorter
    than four characters counts as Turkish.
    """
    lowered = word.lower()
    if lowered in KNOWN_FOREIGN_BASES:
        return False
    return (
        not TR_CHARS.isdisjoint(word)
        or lowered in KNOWN_TURKISH_BASES
        or len(lowered) < 4
    )


# ── Fix 1: ALL CAPS ───────────────────────────────────────────────────────────

def _fix_all_caps(text: str) -> tuple[str, set]:
    """Lowercase every ALL-CAPS run in *text*.

    Returns the rewritten text together with the set of lowercased words,
    so the caller can restore the uppercase marking after tokenization.
    """
    recorded: set[str] = set()

    def _lower(match: re.Match) -> str:
        lowered = match.group(1).lower()
        recorded.add(lowered)
        return lowered

    return _CAPS_RE.sub(_lower, text), recorded


def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
    result: list[dict] = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        raw_low = tok["token"].strip().lower()

        if tok["type"] == "ROOT" and raw_low in caps:
            result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
            result.append(tok)
            i += 1
            continue

        if tok["type"] == "BPE" and tok["token"].startswith(" "):
            combined  = raw_low
            lookahead = [tok]
            j = i + 1
            while j < len(tokens):
                nt = tokens[j]
                if not nt["token"].startswith(" "):
                    combined += nt["token"].strip().lower()
                    lookahead.append(nt)
                    j += 1
                    if combined in caps:
                        break
                    if len(combined) > 8:
                        break
                else:
                    break
            if combined in caps:
                result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
                result.append({"token": f" {combined}", "type": "ROOT",
                                "_acronym": True, "_caps": True})
                i = j
                continue

        result.append(tok)
        i += 1

    return result


# ── Fix 2: Apostrophe split ───────────────────────────────────────────────────

def _split_apostrophe(text: str) -> str:
    """Split foreign-base + apostrophe + Turkish-suffix words in *text*.

    For matches like ``Zoom'da`` where the base is NOT judged Turkish and
    the part after the apostrophe is a known Turkish suffix, the apostrophe
    is replaced by the ``_APO_SEP`` sentinel (space-padded) so the
    tokenizer sees base and suffix as separate words.  All other matches
    are left untouched.
    """
    def _repl(m: re.Match) -> str:
        base, suffix = m.group(1), m.group(2)
        if _is_turkish_base(base):
            return m.group(0)  # Turkish bases keep their apostrophe intact
        # Plain membership test; the hand-rolled any(suffix == s for s in …)
        # scan was equivalent but less idiomatic.  Lowercase once, up front.
        if suffix.lower() in TURKISH_SUFFIXES_AFTER_APOSTROPHE:
            return f"{base} {_APO_SEP} {suffix}"
        return m.group(0)

    return _APO_RE.sub(_repl, text)


def _merge_apostrophe_tokens(tokens: list[dict]) -> list[dict]:
    """Drop ``_APO_SEP`` sentinel tokens and retag their neighbours.

    The token before the sentinel (if any) becomes a ROOT tagged
    ``_foreign``; the token after it (if any) becomes a SUFFIX tagged
    ``_apo_suffix``.  Note the neighbour dicts are retagged in place.
    """
    merged: list[dict] = []
    idx = 0
    total = len(tokens)
    while idx < total:
        current = tokens[idx]
        if _APO_SEP not in current["token"].strip():
            merged.append(current)
            idx += 1
            continue
        # Sentinel token: retag neighbours, drop the sentinel itself.
        if merged:
            merged[-1]["type"] = "ROOT"
            merged[-1]["_foreign"] = True
        idx += 1
        if idx < total:
            suffix_tok = tokens[idx]
            suffix_tok["type"] = "SUFFIX"
            suffix_tok["_apo_suffix"] = True
            merged.append(suffix_tok)
            idx += 1
    return merged


# ── Combined pre / post ───────────────────────────────────────────────────────

def preprocess(text: str) -> tuple[str, set]:
    """Apply both text fixes: ALL-CAPS folding, then the apostrophe split.

    Returns the rewritten text and the set of folded caps words, which
    must be passed back to :func:`postprocess`.
    """
    folded, caps_words = _fix_all_caps(text)
    return _split_apostrophe(folded), caps_words


def postprocess(tokens: list[dict], caps: set) -> list[dict]:
    """Undo the preprocessing on the token stream: restore caps markers,
    then merge the apostrophe sentinel tokens."""
    return _merge_apostrophe_tokens(_restore_caps_tokens(tokens, caps))