"""Fix 1: ALL CAPS inflation fix. Fix 2: Apostrophe / code-switching split.""" from __future__ import annotations import re TR_CHARS = set("çğışöüÇĞİŞÖÜ") KNOWN_TURKISH_BASES = { "istanbul", "ankara", "izmir", "türkiye", "anadolu", "boğaziçi", "cumhuriyet", "atatürk", "karadeniz", "marmara", "ege", "akdeniz", "temmuz", "ocak", "şubat", "mart", "nisan", "mayıs", "haziran", "ağustos", "eylül", "ekim", "kasım", "aralık", } KNOWN_FOREIGN_BASES = { "python", "zoom", "google", "github", "twitter", "youtube", "instagram", "linkedin", "facebook", "whatsapp", "telegram", "numpy", "pandas", "django", "flask", "react", "javascript", "typescript", "docker", "linux", "windows", "android", "iphone", "chatgpt", "openai", "claude", "gemini", "llama", "bert", "excel", "powerpoint", "outlook", "teams", "slack", "notion", "spotify", "netflix", "amazon", "alibaba", "huawei", "samsung", } TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted( [ "nın","nin","nun","nün","dan","den","tan","ten", "da","de","ta","te","ya","ye","nda","nde", "yı","yi","yu","yü","nı","ni","nu","nü", "lar","ler","lara","lere","ları","leri", "ım","im","um","üm","ın","in","un","ün", "mız","miz","muz","müz","nız","niz","nuz","nüz", "dır","dir","dur","dür","tır","tir","tur","tür", "ki","li","lı","lu","lü","sız","siz","suz","süz", "a","e","ı","i","u","ü", ], key=len, reverse=True, ) _APO_SEP = "\ue001" _APO_RE = re.compile( r"([A-Za-zÇçĞğİıÖöŞşÜü0-9]{2,})['\u2019]([A-Za-zÇçĞğİıÖöŞşÜü]{1,6})\b" ) _CAPS_RE = re.compile(r'\b([A-ZÇĞİÖŞÜ]{2,})\b') def _is_turkish_base(word: str) -> bool: w = word.lower() if w in KNOWN_FOREIGN_BASES: return False if any(c in TR_CHARS for c in word): return True if w in KNOWN_TURKISH_BASES: return True if len(w) < 4: return True return False # ── Fix 1: ALL CAPS ─────────────────────────────────────────────────────────── def _fix_all_caps(text: str) -> tuple[str, set]: caps: set[str] = set() def _replace(m: re.Match) -> str: w = m.group(1) caps.add(w.lower()) return w.lower() return _CAPS_RE.sub(_replace, text), caps def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]: result: list[dict] = [] i = 0 while i < len(tokens): tok = tokens[i] raw_low = tok["token"].strip().lower() if tok["type"] == "ROOT" and raw_low in caps: result.append({"token": "", "type": "ROOT", "_caps": True}) result.append(tok) i += 1 continue if tok["type"] == "BPE" and tok["token"].startswith(" "): combined = raw_low lookahead = [tok] j = i + 1 while j < len(tokens): nt = tokens[j] if not nt["token"].startswith(" "): combined += nt["token"].strip().lower() lookahead.append(nt) j += 1 if combined in caps: break if len(combined) > 8: break else: break if combined in caps: result.append({"token": "", "type": "ROOT", "_caps": True}) result.append({"token": f" {combined}", "type": "ROOT", "_acronym": True, "_caps": True}) i = j continue result.append(tok) i += 1 return result # ── Fix 2: Apostrophe split ─────────────────────────────────────────────────── def _split_apostrophe(text: str) -> str: def _repl(m: re.Match) -> str: base, suffix = m.group(1), m.group(2) if _is_turkish_base(base): return m.group(0) if any(suffix.lower() == s for s in TURKISH_SUFFIXES_AFTER_APOSTROPHE): return f"{base} {_APO_SEP} {suffix}" return m.group(0) return _APO_RE.sub(_repl, text) def _merge_apostrophe_tokens(tokens: list[dict]) -> list[dict]: result: list[dict] = [] i = 0 while i < len(tokens): tok = tokens[i] if _APO_SEP in tok["token"].strip(): if result: result[-1]["type"] = "ROOT" result[-1]["_foreign"] = True i += 1 if i < len(tokens): tokens[i]["type"] = "SUFFIX" tokens[i]["_apo_suffix"] = True result.append(tokens[i]) i += 1 else: result.append(tok) i += 1 return result # ── Combined pre / post ─────────────────────────────────────────────────────── def preprocess(text: str) -> tuple[str, set]: text, caps = _fix_all_caps(text) text = _split_apostrophe(text) return text, caps def postprocess(tokens: list[dict], caps: set) -> list[dict]: tokens = _restore_caps_tokens(tokens, caps) tokens = _merge_apostrophe_tokens(tokens) return tokens