"""Fix 1: ALL CAPS inflation fix. Fix 2: Apostrophe / code-switching split."""
from __future__ import annotations
import re
TR_CHARS = set("Γ§ΔΔ±ΕΓΆΓΌΓΔΔ°ΕΓΓ")
KNOWN_TURKISH_BASES = {
"istanbul", "ankara", "izmir", "tΓΌrkiye", "anadolu", "boΔaziΓ§i",
"cumhuriyet", "atatΓΌrk", "karadeniz", "marmara", "ege", "akdeniz",
"temmuz", "ocak", "Εubat", "mart", "nisan", "mayΔ±s", "haziran",
"aΔustos", "eylΓΌl", "ekim", "kasΔ±m", "aralΔ±k",
}
KNOWN_FOREIGN_BASES = {
"python", "zoom", "google", "github", "twitter", "youtube",
"instagram", "linkedin", "facebook", "whatsapp", "telegram",
"numpy", "pandas", "django", "flask", "react", "javascript",
"typescript", "docker", "linux", "windows", "android", "iphone",
"chatgpt", "openai", "claude", "gemini", "llama", "bert",
"excel", "powerpoint", "outlook", "teams", "slack", "notion",
"spotify", "netflix", "amazon", "alibaba", "huawei", "samsung",
}
TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
[
"nΔ±n","nin","nun","nΓΌn","dan","den","tan","ten",
"da","de","ta","te","ya","ye","nda","nde",
"yΔ±","yi","yu","yΓΌ","nΔ±","ni","nu","nΓΌ",
"lar","ler","lara","lere","larΔ±","leri",
"Δ±m","im","um","ΓΌm","Δ±n","in","un","ΓΌn",
"mΔ±z","miz","muz","mΓΌz","nΔ±z","niz","nuz","nΓΌz",
"dΔ±r","dir","dur","dΓΌr","tΔ±r","tir","tur","tΓΌr",
"ki","li","lΔ±","lu","lΓΌ","sΔ±z","siz","suz","sΓΌz",
"a","e","Δ±","i","u","ΓΌ",
],
key=len,
reverse=True,
)
_APO_SEP = "\ue001"
_APO_RE = re.compile(
r"([A-Za-zΓΓ§ΔΔΔ°Δ±ΓΓΆΕΕΓΓΌ0-9]{2,})['\u2019]([A-Za-zΓΓ§ΔΔΔ°Δ±ΓΓΆΕΕΓΓΌ]{1,6})\b"
)
_CAPS_RE = re.compile(r'\b([A-ZΓΔΔ°ΓΕΓ]{2,})\b')
def _is_turkish_base(word: str) -> bool:
    """Heuristically decide whether *word* is a native Turkish base.

    Priority: a known foreign base is never Turkish; otherwise any
    Turkish-specific letter, a known Turkish base, or a very short word
    (< 4 chars, too ambiguous to treat as foreign) counts as Turkish.
    """
    lowered = word.lower()
    if lowered in KNOWN_FOREIGN_BASES:
        return False
    return (
        any(ch in TR_CHARS for ch in word)
        or lowered in KNOWN_TURKISH_BASES
        or len(lowered) < 4
    )
# ββ Fix 1: ALL CAPS βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def _fix_all_caps(text: str) -> tuple[str, set]:
caps: set[str] = set()
def _replace(m: re.Match) -> str:
w = m.group(1)
caps.add(w.lower())
return w.lower()
return _CAPS_RE.sub(_replace, text), caps
def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
    """Re-insert ``<uppercase_word>`` markers for words recorded by ``_fix_all_caps``.

    Two cases are handled:
    * a single ROOT token whose stripped, lowercased text is in *caps* —
      a marker token is emitted before it;
    * a word split across BPE pieces: a space-initial BPE token followed
      by continuation pieces.  Pieces are greedily concatenated until the
      combined string hits *caps* (match), exceeds 8 chars (give up), or
      the next piece starts a new word (space prefix).  On a match the
      pieces are collapsed into one synthetic ROOT token tagged
      ``_acronym``/``_caps``, preceded by the marker.

    Input token dicts are not mutated; a new list is returned.
    """
    result: list[dict] = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        raw_low = tok["token"].strip().lower()
        # Case 1: whole word already a single ROOT token.
        if tok["type"] == "ROOT" and raw_low in caps:
            result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
            result.append(tok)
            i += 1
            continue
        # Case 2: word-initial BPE piece (leading space) — try to
        # reassemble the full word from the continuation pieces.
        if tok["type"] == "BPE" and tok["token"].startswith(" "):
            combined = raw_low
            lookahead = [tok]  # pieces consumed so far; kept for inspection, not otherwise used
            j = i + 1
            while j < len(tokens):
                nt = tokens[j]
                if not nt["token"].startswith(" "):
                    combined += nt["token"].strip().lower()
                    lookahead.append(nt)
                    j += 1
                    if combined in caps:
                        break
                    if len(combined) > 8:
                        # Bail out: unlikely to be one of our recorded words.
                        break
                else:
                    # Next piece starts a new word — stop extending.
                    break
            if combined in caps:
                result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
                result.append({"token": f" {combined}", "type": "ROOT",
                               "_acronym": True, "_caps": True})
                i = j  # skip all consumed pieces
                continue
        # Default: pass the token through unchanged.
        result.append(tok)
        i += 1
    return result
# ββ Fix 2: Apostrophe split βββββββββββββββββββββββββββββββββββββββββββββββββββ
def _split_apostrophe(text: str) -> str:
    """Split foreign-base + Turkish-suffix apostrophe forms in *text*.

    "Zoom'da" becomes "Zoom \ue001 da" (via ``_APO_SEP``) so the tokenizer
    handles base and suffix separately; native Turkish bases (per
    ``_is_turkish_base``) and unrecognized suffixes are left untouched.
    The sentinel is removed again by ``_merge_apostrophe_tokens``.
    """
    def _repl(m: re.Match) -> str:
        base, suffix = m.group(1), m.group(2)
        if _is_turkish_base(base):
            # Native word: the apostrophe form is already well handled.
            return m.group(0)
        # 'İ'.lower() yields 'i' + combining dot (U+0307); normalize it
        # first so an ALL-CAPS suffix such as "Nİ" still matches its
        # table entry.  Exact membership test replaces the previous
        # manual any(...) equality scan.
        if suffix.replace("İ", "i").lower() in TURKISH_SUFFIXES_AFTER_APOSTROPHE:
            return f"{base} {_APO_SEP} {suffix}"
        return m.group(0)
    return _APO_RE.sub(_repl, text)
def _merge_apostrophe_tokens(tokens: list[dict]) -> list[dict]:
    """Collapse the ``_APO_SEP`` sentinel inserted by ``_split_apostrophe``.

    On encountering a sentinel token: the previously emitted token (the
    foreign base) is re-typed ROOT and tagged ``_foreign``; the sentinel
    itself is dropped; the following token (the suffix) is re-typed
    SUFFIX, tagged ``_apo_suffix``, and emitted.

    NOTE: this mutates the caller's token dicts in place (both the one
    already appended to *result* and the suffix dict inside *tokens*);
    callers must not rely on the input dicts being unchanged.
    """
    result: list[dict] = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        if _APO_SEP in tok["token"].strip():
            if result:  # guard: sentinel at stream start has no base to tag
                result[-1]["type"] = "ROOT"
                result[-1]["_foreign"] = True
            i += 1  # skip the sentinel token itself
            if i < len(tokens):
                tokens[i]["type"] = "SUFFIX"
                tokens[i]["_apo_suffix"] = True
                result.append(tokens[i])
                i += 1
        else:
            result.append(tok)
            i += 1
    return result
# ββ Combined pre / post βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def preprocess(text: str) -> tuple[str, set]:
    """Apply both text-level fixes before tokenization.

    ALL-CAPS words are lowered (and recorded in the returned set), then
    foreign-base apostrophe forms are split with the sentinel separator.
    """
    lowered, caps_words = _fix_all_caps(text)
    return _split_apostrophe(lowered), caps_words
def postprocess(tokens: list[dict], caps: set) -> list[dict]:
    """Undo the pre-tokenization rewrites on the token stream.

    Restores ``<uppercase_word>`` markers first, then merges the
    apostrophe sentinel back into ROOT/SUFFIX pairs.
    """
    return _merge_apostrophe_tokens(_restore_caps_tokens(tokens, caps))
|