"""Fix 1: ALL CAPS inflation fix. Fix 2: Apostrophe / code-switching split.""" from __future__ import annotations import re from pathlib import Path TR_CHARS = set("çğışöüÇĞİŞÖÜ") _PROPER_NOUNS: set[str] | None = None def _load_proper_nouns() -> set[str]: global _PROPER_NOUNS if _PROPER_NOUNS is not None: return _PROPER_NOUNS path = Path(__file__).parent / "data" / "turkish_proper_nouns.txt" if path.exists(): _PROPER_NOUNS = { line.strip().lower() for line in path.read_text(encoding="utf-8").splitlines() if line.strip() and not line.startswith("#") } else: _PROPER_NOUNS = set() return _PROPER_NOUNS def _turkish_lower(s: str) -> str: """Turkish-aware lowercase: İ→i, I→ı (not i), then standard lower.""" return s.replace("İ", "i").replace("I", "ı").lower() TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted( [ "nın","nin","nun","nün","dan","den","tan","ten", "da","de","ta","te","ya","ye","nda","nde", "yı","yi","yu","yü","nı","ni","nu","nü", "lar","ler","lara","lere","ları","leri", "ım","im","um","üm","ın","in","un","ün", "mız","miz","muz","müz","nız","niz","nuz","nüz", "dır","dir","dur","dür","tır","tir","tur","tür", "ki","li","lı","lu","lü","sız","siz","suz","süz", "a","e","ı","i","u","ü", ], key=len, reverse=True, ) _APO_RE = re.compile( r"([A-Za-zÇçĞğİıÖöŞşÜü0-9]{2,})['\u2019]([A-Za-zÇçĞğİıÖöŞşÜü]{1,6})\b" ) _CAPS_RE = re.compile(r'\b([A-ZÇĞİÖŞÜ]{2,})\b') def _is_turkish_base(word: str) -> bool: """Return True if the word should be treated as Turkish (don't split apostrophe).""" wl = _turkish_lower(word) # Fast path: Turkish-specific characters → definitely Turkish if any(c in TR_CHARS for c in wl): return True # Turkish proper nouns (cities, regions) — not in TDK common-word list if wl in _load_proper_nouns(): return True # TDK lookup: if it's in the dictionary it's Turkish (or an accepted loanword) from ._tdk_vocab import load_tdk_words # noqa: PLC0415 tdk = load_tdk_words() if tdk and wl in tdk: return True # Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir…) try: from ._root_validator import _morphology, ZEMBEREK_AVAILABLE # noqa: PLC0415 if ZEMBEREK_AVAILABLE and _morphology: for analysis in _morphology.analyze(wl): lemma = str(analysis).split("]")[0].lstrip("[") if any(c in TR_CHARS for c in lemma): return True except Exception: # noqa: BLE001 pass # TDK unavailable + Zemberek unavailable: very short words are ambiguous return len(wl) < 4 # ── Fix 1: ALL CAPS ─────────────────────────────────────────────────────────── def _fix_all_caps(text: str) -> tuple[str, set]: caps: set[str] = set() def _replace(m: re.Match) -> str: w = m.group(1) caps.add(_turkish_lower(w)) return _turkish_lower(w) return _CAPS_RE.sub(_replace, text), caps def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]: result: list[dict] = [] i = 0 while i < len(tokens): tok = tokens[i] raw_low = _turkish_lower(tok["token"].strip()) if tok["type"] == "ROOT" and raw_low in caps: result.append({"token": "", "type": "ROOT", "_caps": True}) result.append(tok) i += 1 continue if tok["type"] == "BPE" and tok["token"].startswith(" "): combined = raw_low lookahead = [tok] j = i + 1 while j < len(tokens): nt = tokens[j] if not nt["token"].startswith(" "): combined += _turkish_lower(nt["token"].strip()) lookahead.append(nt) j += 1 if combined in caps: break if len(combined) > 8: break else: break if combined in caps: result.append({"token": "", "type": "ROOT", "_caps": True}) result.append({"token": f" {combined}", "type": "ROOT", "_acronym": True, "_caps": True}) i = j continue result.append(tok) i += 1 return result # ── Fix 2: Apostrophe split ─────────────────────────────────────────────────── # # Strategy: record (foreign_base, suffix) pairs, replace apostrophe with space. # After tokenization, _merge_apostrophe_tokens uses these pairs to find the # BPE pieces that form the foreign word and merge them into one FOREIGN ROOT, # then marks the following word-initial suffix token as SUFFIX. # # Old approach used a \ue001 separator — the base tokenizer converts that to # '' so the separator was never found. Simple-space + pair-list is # robust regardless of how the tokenizer handles the input. def _split_apostrophe(text: str) -> tuple[str, list[tuple[str, str]]]: """ Replace FOREIGN'SUFFIX with 'FOREIGN SUFFIX' (apostrophe → space). Returns (modified_text, [(foreign_base_lower, suffix_lower), ...]). Turkish proper names (İstanbul'da) are left unchanged. """ splits: list[tuple[str, str]] = [] def _repl(m: re.Match) -> str: base, suffix = m.group(1), m.group(2) if _is_turkish_base(base): return m.group(0) # leave Turkish names alone sl = suffix.lower() if any(sl == s for s in TURKISH_SUFFIXES_AFTER_APOSTROPHE): splits.append((_turkish_lower(base), sl)) return f"{base} {suffix}" # just drop the apostrophe return m.group(0) return _APO_RE.sub(_repl, text), splits def _merge_apostrophe_tokens( tokens: list[dict], apo_splits: list[tuple[str, str]] ) -> list[dict]: """ For each (foreign_base, suffix) pair recorded during _split_apostrophe, find the consecutive BPE/ROOT pieces that together spell foreign_base, merge them into one FOREIGN ROOT token, and mark the next word-initial token whose stripped form == suffix as SUFFIX. """ if not apo_splits: return tokens result = list(tokens) for foreign_base, suffix in apo_splits: n = len(result) for j in range(1, n): tok_j = result[j] # Candidate suffix token: word-initial, stripped == suffix if not tok_j["token"].startswith(" "): continue if _turkish_lower(tok_j["token"].strip()) != suffix: continue # Walk back to find pieces of the word before j (no leading space) word_start = j - 1 while word_start > 0 and not result[word_start]["token"].startswith(" "): word_start -= 1 pieces = result[word_start:j] if not pieces: continue combined = "".join(_turkish_lower(p["token"].strip()) for p in pieces) if combined != foreign_base: continue # Merge pieces into one FOREIGN ROOT merged = pieces[0]["token"] # keeps leading space for p in pieces[1:]: merged += p["token"].strip() new_root = {"token": merged, "type": "ROOT", "_foreign": True} new_suf = {**tok_j, "type": "SUFFIX", "_apo_suffix": True} result = ( result[:word_start] + [new_root, new_suf] + result[j + 1:] ) break # this pair is handled return result # ── Combined pre / post ─────────────────────────────────────────────────────── def preprocess(text: str) -> tuple[str, set, list]: """Prepare text before base tokenization. Returns: (modified_text, caps_set, apo_splits) """ text, caps = _fix_all_caps(text) text, apo_splits = _split_apostrophe(text) return text, caps, apo_splits def postprocess( tokens: list[dict], caps: set, apo_splits: list | None = None ) -> list[dict]: """Fix tokens after base tokenization.""" tokens = _restore_caps_tokens(tokens, caps) tokens = _merge_apostrophe_tokens(tokens, apo_splits or []) return tokens