| """Fix 1: ALL CAPS inflation fix. Fix 2: Apostrophe / code-switching split.""" |
|
|
| from __future__ import annotations |
|
|
| import re |
|
|
# Letters that exist in Turkish but not in plain ASCII English; seeing one
# of these anywhere in a word is treated as proof the word is Turkish.
TR_CHARS = set("çğışöüÇĞİŞÖÜ")


# Lowercase bases known to be Turkish (place names, seas, month names) even
# though they contain no Turkish-specific character.
KNOWN_TURKISH_BASES = {
    "istanbul", "ankara", "izmir", "türkiye", "anadolu", "boğaziçi",
    "cumhuriyet", "atatürk", "karadeniz", "marmara", "ege", "akdeniz",
    "temmuz", "ocak", "şubat", "mart", "nisan", "mayıs", "haziran",
    "ağustos", "eylül", "ekim", "kasım", "aralık",
}


# Lowercase bases known to be foreign (mostly tech/brand terms); their
# apostrophe-suffixed forms get split so base and suffix tokenize apart.
KNOWN_FOREIGN_BASES = {
    "python", "zoom", "google", "github", "twitter", "youtube",
    "instagram", "linkedin", "facebook", "whatsapp", "telegram",
    "numpy", "pandas", "django", "flask", "react", "javascript",
    "typescript", "docker", "linux", "windows", "android", "iphone",
    "chatgpt", "openai", "claude", "gemini", "llama", "bert",
    "excel", "powerpoint", "outlook", "teams", "slack", "notion",
    "spotify", "netflix", "amazon", "alibaba", "huawei", "samsung",
}


# Turkish suffixes that may follow an apostrophe (case, plural, possessive,
# copula, derivational ...).  Sorted longest-first so any longest-match scan
# over the list prefers e.g. "dan" before falling back to bare "a".
TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
    [
        "nın","nin","nun","nün","dan","den","tan","ten",
        "da","de","ta","te","ya","ye","nda","nde",
        "yı","yi","yu","yü","nı","ni","nu","nü",
        "lar","ler","lara","lere","ları","leri",
        "ım","im","um","üm","ın","in","un","ün",
        "mız","miz","muz","müz","nız","niz","nuz","nüz",
        "dır","dir","dur","dür","tır","tir","tur","tür",
        "ki","li","lı","lu","lü","sız","siz","suz","süz",
        "a","e","ı","i","u","ü",
    ],
    key=len,
    reverse=True,
)


# Private-use sentinel inserted between a foreign base and its Turkish
# suffix by _split_apostrophe; consumed later by _merge_apostrophe_tokens.
_APO_SEP = "\ue001"
# base'suffix: 2+ word characters, a straight (') or curly (U+2019)
# apostrophe, then 1-6 letters ending at a word boundary.
_APO_RE = re.compile(
    r"([A-Za-zÇçĞğİıÖöŞşÜü0-9]{2,})['\u2019]([A-Za-zÇçĞğİıÖöŞşÜü]{1,6})\b"
)
# A run of 2+ uppercase letters (ASCII or Turkish) — an ALL-CAPS word.
_CAPS_RE = re.compile(r'\b([A-ZÇĞİÖŞÜ]{2,})\b')
|
|
|
|
def _is_turkish_base(word: str) -> bool:
    """Heuristically decide whether *word* is a Turkish base word.

    Known foreign terms are rejected first; otherwise a word counts as
    Turkish if it contains a Turkish-only letter, appears in the known
    Turkish base list, or is very short (< 4 chars — too little evidence
    to call it foreign).
    """
    lowered = word.lower()
    if lowered in KNOWN_FOREIGN_BASES:
        return False
    has_turkish_letter = any(ch in TR_CHARS for ch in word)
    return (
        has_turkish_letter
        or lowered in KNOWN_TURKISH_BASES
        or len(lowered) < 4
    )
|
|
|
|
| |
|
|
| def _fix_all_caps(text: str) -> tuple[str, set]: |
| caps: set[str] = set() |
|
|
| def _replace(m: re.Match) -> str: |
| w = m.group(1) |
| caps.add(w.lower()) |
| return w.lower() |
|
|
| return _CAPS_RE.sub(_replace, text), caps |
|
|
|
|
| def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]: |
| result: list[dict] = [] |
| i = 0 |
| while i < len(tokens): |
| tok = tokens[i] |
| raw_low = tok["token"].strip().lower() |
|
|
| if tok["type"] == "ROOT" and raw_low in caps: |
| result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True}) |
| result.append(tok) |
| i += 1 |
| continue |
|
|
| if tok["type"] == "BPE" and tok["token"].startswith(" "): |
| combined = raw_low |
| lookahead = [tok] |
| j = i + 1 |
| while j < len(tokens): |
| nt = tokens[j] |
| if not nt["token"].startswith(" "): |
| combined += nt["token"].strip().lower() |
| lookahead.append(nt) |
| j += 1 |
| if combined in caps: |
| break |
| if len(combined) > 8: |
| break |
| else: |
| break |
| if combined in caps: |
| result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True}) |
| result.append({"token": f" {combined}", "type": "ROOT", |
| "_acronym": True, "_caps": True}) |
| i = j |
| continue |
|
|
| result.append(tok) |
| i += 1 |
|
|
| return result |
|
|
|
|
| |
|
|
def _split_apostrophe(text: str) -> str:
    """Split foreign-base + apostrophe + Turkish-suffix pairs in *text*.

    ``Zoom'da`` becomes ``Zoom \ue001 da`` (base, sentinel, suffix) so the
    tokenizer handles the code-switched base and its Turkish suffix
    separately; Turkish bases such as ``İzmir'de`` are left untouched.
    """
    # Hoist membership into a frozenset: the module-level list is sorted
    # longest-first for prefix scans, but here we need only exact
    # membership, so a set gives O(1) lookups instead of a linear scan
    # per regex match.
    suffixes = frozenset(TURKISH_SUFFIXES_AFTER_APOSTROPHE)

    def _repl(m: re.Match) -> str:
        base, suffix = m.group(1), m.group(2)
        if _is_turkish_base(base):
            return m.group(0)  # Turkish word: keep the apostrophe form
        if suffix.lower() in suffixes:
            return f"{base} {_APO_SEP} {suffix}"
        return m.group(0)  # suffix not Turkish: leave as-is

    return _APO_RE.sub(_repl, text)
|
|
|
|
def _merge_apostrophe_tokens(tokens: list[dict]) -> list[dict]:
    """Consume apostrophe-sentinel tokens and relabel their neighbours.

    When a token containing _APO_SEP is found, the preceding emitted token
    is marked as a foreign ROOT, the sentinel token itself is dropped, and
    the following token (if any) is relabelled as the Turkish SUFFIX.
    Token dicts are updated in place.
    """
    merged: list[dict] = []
    idx = 0
    total = len(tokens)
    while idx < total:
        current = tokens[idx]
        if _APO_SEP not in current["token"].strip():
            merged.append(current)
            idx += 1
            continue
        # Sentinel hit: previous token (if one was emitted) is the base.
        if merged:
            merged[-1]["type"] = "ROOT"
            merged[-1]["_foreign"] = True
        idx += 1  # drop the sentinel token itself
        if idx < total:
            suffix_tok = tokens[idx]
            suffix_tok["type"] = "SUFFIX"
            suffix_tok["_apo_suffix"] = True
            merged.append(suffix_tok)
            idx += 1
    return merged
|
|
|
|
| |
|
|
def preprocess(text: str) -> tuple[str, set]:
    """Apply both text-side fixes before tokenization.

    First lowers ALL-CAPS words (recording them in a set for later
    restoration), then splits foreign-base apostrophe constructions.
    """
    lowered, caps = _fix_all_caps(text)
    return _split_apostrophe(lowered), caps
|
|
|
|
def postprocess(tokens: list[dict], caps: set) -> list[dict]:
    """Undo preprocessing on the token stream.

    Restores <uppercase_word> markers for the recorded caps words, then
    merges the apostrophe sentinel tokens back into ROOT + SUFFIX pairs.
    """
    return _merge_apostrophe_tokens(_restore_caps_tokens(tokens, caps))
|
|