File size: 8,860 Bytes
a0e8f24
 
 
 
 
8f794ec
a0e8f24
 
 
8f794ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6330193
 
 
 
 
 
a0e8f24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6330193
a0e8f24
 
6330193
a0e8f24
 
 
183e656
8f794ec
183e656
8f794ec
 
 
 
a0e8f24
183e656
 
 
8f794ec
 
 
 
 
 
 
 
 
 
 
 
 
 
a0e8f24
 
 
 
 
 
 
 
 
6330193
 
a0e8f24
 
 
 
 
 
 
 
 
6330193
a0e8f24
 
 
 
 
 
 
 
 
 
 
 
 
 
6330193
a0e8f24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6330193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0e8f24
 
 
 
6330193
 
 
 
 
a0e8f24
 
6330193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0e8f24
6330193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0e8f24
 
 
 
 
 
6330193
 
 
 
 
 
a0e8f24
6330193
 
a0e8f24
 
6330193
 
 
 
a0e8f24
6330193
a0e8f24
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
"""Fix 1: ALL CAPS inflation fix. Fix 2: Apostrophe / code-switching split."""

from __future__ import annotations

import re
from pathlib import Path

TR_CHARS = set("Γ§ΔŸΔ±ΕŸΓΆΓΌΓ‡ΔžΔ°ΕžΓ–Γœ")

_PROPER_NOUNS: set[str] | None = None


def _load_proper_nouns() -> set[str]:
    """Load the bundled Turkish proper-noun list, caching the result.

    Returns:
        A set of Turkish-lowercased proper nouns, or an empty set when the
        data file is missing (callers degrade gracefully instead of raising).
    """
    global _PROPER_NOUNS
    if _PROPER_NOUNS is not None:
        return _PROPER_NOUNS
    path = Path(__file__).parent / "data" / "turkish_proper_nouns.txt"
    if path.exists():
        entries: set[str] = set()
        for line in path.read_text(encoding="utf-8").splitlines():
            stripped = line.strip()
            # Bug fix 1: test the *stripped* line for '#', so comment lines
            # with leading whitespace are skipped too (the old code only
            # skipped column-0 comments).
            if stripped and not stripped.startswith("#"):
                # Bug fix 2: lower with the Turkish-aware mapping.  All
                # lookups (see _is_turkish_base) key on _turkish_lower(word);
                # str.lower() turns 'İ' into 'i' + U+0307, so entries such as
                # 'İstanbul' could never match before.
                entries.add(_turkish_lower(stripped))
        _PROPER_NOUNS = entries
    else:
        _PROPER_NOUNS = set()
    return _PROPER_NOUNS


def _turkish_lower(s: str) -> str:
    """Turkish-aware lowercase: İ→i, I→ı (not i), then standard lower."""
    return s.replace("Δ°", "i").replace("I", "Δ±").lower()


TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
    [
        "nΔ±n","nin","nun","nΓΌn","dan","den","tan","ten",
        "da","de","ta","te","ya","ye","nda","nde",
        "yΔ±","yi","yu","yΓΌ","nΔ±","ni","nu","nΓΌ",
        "lar","ler","lara","lere","larΔ±","leri",
        "Δ±m","im","um","ΓΌm","Δ±n","in","un","ΓΌn",
        "mΔ±z","miz","muz","mΓΌz","nΔ±z","niz","nuz","nΓΌz",
        "dΔ±r","dir","dur","dΓΌr","tΔ±r","tir","tur","tΓΌr",
        "ki","li","lΔ±","lu","lΓΌ","sΔ±z","siz","suz","sΓΌz",
        "a","e","Δ±","i","u","ΓΌ",
    ],
    key=len,
    reverse=True,
)

_APO_RE  = re.compile(
    r"([A-Za-zΓ‡Γ§ΔžΔŸΔ°Δ±Γ–ΓΆΕžΕŸΓœΓΌ0-9]{2,})['\u2019]([A-Za-zΓ‡Γ§ΔžΔŸΔ°Δ±Γ–ΓΆΕžΕŸΓœΓΌ]{1,6})\b"
)
_CAPS_RE = re.compile(r'\b([A-ZΓ‡ΔžΔ°Γ–ΕžΓœ]{2,})\b')


def _is_turkish_base(word: str) -> bool:
    """Return True if the word should be treated as Turkish (don't split apostrophe)."""
    lowered = _turkish_lower(word)

    # Fast path: a Turkish-only letter settles it immediately.
    for ch in lowered:
        if ch in TR_CHARS:
            return True

    # Known proper nouns (cities, regions) that the TDK common-word list lacks.
    if lowered in _load_proper_nouns():
        return True

    # TDK dictionary membership covers common words and accepted loanwords.
    from ._tdk_vocab import load_tdk_words  # noqa: PLC0415
    vocab = load_tdk_words()
    if vocab and lowered in vocab:
        return True

    # Zemberek fallback: accept words whose analyzed lemma carries Turkish
    # characters (İstanbul, İzmir, …); any analyzer failure is non-fatal.
    try:
        from ._root_validator import _morphology, ZEMBEREK_AVAILABLE  # noqa: PLC0415
        if ZEMBEREK_AVAILABLE and _morphology:
            for analysis in _morphology.analyze(lowered):
                lemma = str(analysis).split("]")[0].lstrip("[")
                if any(c in TR_CHARS for c in lemma):
                    return True
    except Exception:  # noqa: BLE001
        pass

    # No TDK, no Zemberek: very short words are too ambiguous to split.
    return len(lowered) < 4


# ── Fix 1: ALL CAPS ───────────────────────────────────────────────────────────

def _fix_all_caps(text: str) -> tuple[str, set]:
    """Lowercase every ALL-CAPS word in *text*.

    Returns:
        (modified_text, set of the Turkish-lowercased forms that were replaced)
    """
    seen: set[str] = set()

    def _lower_match(match: re.Match) -> str:
        lowered = _turkish_lower(match.group(1))
        seen.add(lowered)
        return lowered

    return _CAPS_RE.sub(_lower_match, text), seen


def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
    """Re-insert <uppercase_word> markers before tokens that were ALL CAPS.

    `caps` holds the Turkish-lowercased forms recorded by _fix_all_caps.
    Two shapes are handled: a single ROOT token whose lowered text is in
    `caps`, and a word split across several BPE pieces (word-initial piece
    starts with a space, continuations do not), which is collapsed into one
    ROOT token flagged `_acronym`.
    """
    result: list[dict] = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        raw_low = _turkish_lower(tok["token"].strip())

        # Case 1: the whole word survived tokenization as one ROOT token.
        if tok["type"] == "ROOT" and raw_low in caps:
            result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
            result.append(tok)
            i += 1
            continue

        # Case 2: word-initial BPE piece — greedily append the following
        # continuation pieces (no leading space) until the combined text
        # matches a caps entry or exceeds 8 chars (give-up bound).
        if tok["type"] == "BPE" and tok["token"].startswith(" "):
            combined  = raw_low
            lookahead = [tok]
            j = i + 1
            while j < len(tokens):
                nt = tokens[j]
                if not nt["token"].startswith(" "):
                    combined += _turkish_lower(nt["token"].strip())
                    lookahead.append(nt)
                    j += 1
                    if combined in caps:
                        break
                    if len(combined) > 8:
                        break
                else:
                    break
            if combined in caps:
                # Emit the marker plus one merged ROOT; skip the consumed pieces.
                result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
                result.append({"token": f" {combined}", "type": "ROOT",
                                "_acronym": True, "_caps": True})
                i = j
                continue

        # Default: pass the token through untouched.
        result.append(tok)
        i += 1

    return result


# ── Fix 2: Apostrophe split ───────────────────────────────────────────────────
#
# Strategy: record (foreign_base, suffix) pairs, replace apostrophe with space.
# After tokenization, _merge_apostrophe_tokens uses these pairs to find the
# BPE pieces that form the foreign word and merge them into one FOREIGN ROOT,
# then marks the following word-initial suffix token as SUFFIX.
#
# Old approach used a \ue001 separator β€” the base tokenizer converts that to
# '<unknown>' so the separator was never found. Simple-space + pair-list is
# robust regardless of how the tokenizer handles the input.

def _split_apostrophe(text: str) -> tuple[str, list[tuple[str, str]]]:
    """
    Replace FOREIGN'SUFFIX with 'FOREIGN SUFFIX' (apostrophe → space).

    Returns:
        (modified_text, [(foreign_base_lower, suffix_lower), ...])

    Turkish proper names (İstanbul'da) are left unchanged.
    """
    splits: list[tuple[str, str]] = []

    def _repl(m: re.Match) -> str:
        base, suffix = m.group(1), m.group(2)
        if _is_turkish_base(base):
            return m.group(0)          # leave Turkish names alone
        # Bug fix: lower the suffix with the Turkish-aware mapping (I→ı,
        # İ→i).  str.lower() mapped 'NIN' to "nin" (wrong vowel), and the
        # recorded pair then could not match the _turkish_lower comparisons
        # in _merge_apostrophe_tokens.
        sl = _turkish_lower(suffix)
        # Plain membership test instead of the old linear any(==) scan.
        if sl in TURKISH_SUFFIXES_AFTER_APOSTROPHE:
            splits.append((_turkish_lower(base), sl))
            return f"{base} {suffix}"  # just drop the apostrophe
        return m.group(0)

    return _APO_RE.sub(_repl, text), splits


def _merge_apostrophe_tokens(
    tokens: list[dict], apo_splits: list[tuple[str, str]]
) -> list[dict]:
    """
    For each (foreign_base, suffix) pair recorded during _split_apostrophe,
    find the consecutive BPE/ROOT pieces that together spell foreign_base,
    merge them into one FOREIGN ROOT token, and mark the next word-initial
    token whose stripped form == suffix as SUFFIX.
    """
    if not apo_splits:
        return tokens

    result = list(tokens)

    for foreign_base, suffix in apo_splits:
        n = len(result)
        for j in range(1, n):
            tok_j = result[j]
            # Candidate suffix token: word-initial, stripped == suffix
            if not tok_j["token"].startswith(" "):
                continue
            if _turkish_lower(tok_j["token"].strip()) != suffix:
                continue

            # Walk back to find pieces of the word before j (no leading space)
            word_start = j - 1
            while word_start > 0 and not result[word_start]["token"].startswith(" "):
                word_start -= 1

            pieces = result[word_start:j]
            if not pieces:
                continue

            combined = "".join(_turkish_lower(p["token"].strip()) for p in pieces)
            if combined != foreign_base:
                continue

            # Merge pieces into one FOREIGN ROOT
            merged = pieces[0]["token"]        # keeps leading space
            for p in pieces[1:]:
                merged += p["token"].strip()

            new_root = {"token": merged, "type": "ROOT", "_foreign": True}
            new_suf  = {**tok_j, "type": "SUFFIX", "_apo_suffix": True}

            result = (
                result[:word_start]
                + [new_root, new_suf]
                + result[j + 1:]
            )
            break   # this pair is handled

    return result


# ── Combined pre / post ───────────────────────────────────────────────────────

def preprocess(text: str) -> tuple[str, set, list]:
    """Prepare text before base tokenization.

    Applies the ALL-CAPS fix first, then the apostrophe split, and returns
    everything postprocess() needs to restore/annotate the changes.

    Returns:
        (modified_text, caps_set, apo_splits)
    """
    lowered_text, caps = _fix_all_caps(text)
    split_text, apo_splits = _split_apostrophe(lowered_text)
    return split_text, caps, apo_splits


def postprocess(
    tokens: list[dict], caps: set, apo_splits: list | None = None
) -> list[dict]:
    """Fix tokens after base tokenization (caps restore, then apostrophe merge)."""
    restored = _restore_caps_tokens(tokens, caps)
    pairs = [] if not apo_splits else apo_splits
    return _merge_apostrophe_tokens(restored, pairs)