"""Tokenizer text fixes: (1) deflate ALL-CAPS words to lowercase with a restorable marker; (2) split apostrophe-attached Turkish suffixes off foreign (code-switched) bases."""
|
|
| from __future__ import annotations |
|
|
| import re |
| from pathlib import Path |
|
|
| TR_CHARS = set("Γ§ΔΔ±ΕΓΆΓΌΓΔΔ°ΕΓΓ") |
|
|
| _PROPER_NOUNS: set[str] | None = None |
|
|
|
|
| def _load_proper_nouns() -> set[str]: |
| global _PROPER_NOUNS |
| if _PROPER_NOUNS is not None: |
| return _PROPER_NOUNS |
| path = Path(__file__).parent / "data" / "turkish_proper_nouns.txt" |
| if path.exists(): |
| _PROPER_NOUNS = { |
| line.strip().lower() |
| for line in path.read_text(encoding="utf-8").splitlines() |
| if line.strip() and not line.startswith("#") |
| } |
| else: |
| _PROPER_NOUNS = set() |
| return _PROPER_NOUNS |
|
|
|
|
| def _turkish_lower(s: str) -> str: |
| """Turkish-aware lowercase: Δ°βi, IβΔ± (not i), then standard lower.""" |
| return s.replace("Δ°", "i").replace("I", "Δ±").lower() |
|
|
|
|
# Turkish inflectional endings that may follow an apostrophe (genitive,
# locative/ablative, accusative, plural, possessive, copula, derivational,
# and bare dative vowels), ordered longest-first.
TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
    "nΔ±n nin nun nΓΌn dan den tan ten "
    "da de ta te ya ye nda nde "
    "yΔ± yi yu yΓΌ nΔ± ni nu nΓΌ "
    "lar ler lara lere larΔ± leri "
    "Δ±m im um ΓΌm Δ±n in un ΓΌn "
    "mΔ±z miz muz mΓΌz nΔ±z niz nuz nΓΌz "
    "dΔ±r dir dur dΓΌr tΔ±r tir tur tΓΌr "
    "ki li lΔ± lu lΓΌ sΔ±z siz suz sΓΌz "
    "a e Δ± i u ΓΌ".split(),
    key=len,
    reverse=True,
)
|
|
# base'suffix pairs: a 2+ char alphanumeric base (Turkish letters allowed),
# an ASCII or right-single-quote apostrophe, then a 1-6 letter tail that is
# a candidate Turkish suffix.
_APO_RE = re.compile(
    r"([A-Za-zΓΓ§ΔΔΔ°Δ±ΓΓΆΕΕΓΓΌ0-9]{2,})['\u2019]([A-Za-zΓΓ§ΔΔΔ°Δ±ΓΓΆΕΕΓΓΌ]{1,6})\b"
)
# Runs of 2+ uppercase letters (including Turkish uppercase), i.e.
# ALL-CAPS words and acronyms.
_CAPS_RE = re.compile(r'\b([A-ZΓΔΔ°ΓΕΓ]{2,})\b')
|
|
|
|
def _is_turkish_base(word: str) -> bool:
    """Return True if the word should be treated as Turkish (don't split apostrophe).

    Checks, cheapest first: Turkish-only characters, the bundled
    proper-noun list, the TDK vocabulary, and (when available) Zemberek
    morphological analysis.  Short bases (< 4 chars) default to Turkish.
    """
    wl = _turkish_lower(word)

    # Cheapest signal: any character unique to the Turkish alphabet.
    if any(c in TR_CHARS for c in wl):
        return True

    if wl in _load_proper_nouns():
        return True

    # TDK vocabulary is an optional dependency; swallow lookup failures
    # (consistent with the Zemberek branch below) instead of crashing.
    try:
        from ._tdk_vocab import load_tdk_words
        tdk = load_tdk_words()
        if tdk and wl in tdk:
            return True
    except Exception:
        pass

    # Zemberek morphology: Turkish if any analyzed lemma contains a
    # Turkish-only character.
    try:
        from ._root_validator import _morphology, ZEMBEREK_AVAILABLE
        if ZEMBEREK_AVAILABLE and _morphology:
            for analysis in _morphology.analyze(wl):
                lemma = str(analysis).split("]")[0].lstrip("[")
                if any(c in TR_CHARS for c in lemma):
                    return True
    except Exception:
        pass

    # Very short bases are ambiguous; err on the Turkish side.
    return len(wl) < 4
|
|
|
|
| |
|
|
def _fix_all_caps(text: str) -> tuple[str, set]:
    """Lowercase every ALL-CAPS word in *text*.

    Returns (modified_text, caps) where caps holds the lowercased forms,
    later used by _restore_caps_tokens to re-mark those words.
    """
    caps: set[str] = set()

    def _replace(m: re.Match) -> str:
        # Lower once (original called _turkish_lower twice per match),
        # record it, and substitute it into the text.
        lowered = _turkish_lower(m.group(1))
        caps.add(lowered)
        return lowered

    return _CAPS_RE.sub(_replace, text), caps
|
|
|
|
def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
    """Insert an <uppercase_word> marker before tokens that were ALL-CAPS.

    Two cases are handled: a single ROOT token whose lowercased text is in
    *caps*, and a run of BPE pieces (word-initial piece carries a leading
    space) that together spell a word in *caps*; such a run is merged into
    one ROOT token flagged as an acronym.  (Also drops the original's
    `lookahead` list, which was built but never read.)
    """
    result: list[dict] = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        raw_low = _turkish_lower(tok["token"].strip())

        # Case 1: the word survived tokenization as a single ROOT token.
        if tok["type"] == "ROOT" and raw_low in caps:
            result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
            result.append(tok)
            i += 1
            continue

        # Case 2: the word was shattered into BPE pieces.  Greedily absorb
        # continuation pieces (no leading space) until we match a caps
        # entry, exceed a length cap, or hit the next word.
        if tok["type"] == "BPE" and tok["token"].startswith(" "):
            combined = raw_low
            j = i + 1
            while j < len(tokens):
                nt = tokens[j]
                if not nt["token"].startswith(" "):
                    combined += _turkish_lower(nt["token"].strip())
                    j += 1
                    if combined in caps:
                        break
                    if len(combined) > 8:  # unlikely to be an acronym past this
                        break
                else:
                    break
            if combined in caps:
                result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
                result.append({"token": f" {combined}", "type": "ROOT",
                               "_acronym": True, "_caps": True})
                i = j  # skip the pieces we just merged
                continue

        result.append(tok)
        i += 1

    return result
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
def _split_apostrophe(text: str) -> tuple[str, list[tuple[str, str]]]:
    """
    Replace FOREIGN'SUFFIX with 'FOREIGN SUFFIX' (apostrophe β space).
    Returns (modified_text, [(foreign_base_lower, suffix_lower), ...]).
    Turkish proper names (Δ°stanbul'da) are left unchanged.
    """
    splits: list[tuple[str, str]] = []

    def _repl(m: re.Match) -> str:
        base, suffix = m.group(1), m.group(2)
        if _is_turkish_base(base):
            return m.group(0)  # Turkish word: keep the apostrophe intact
        # Turkish-aware lowering so uppercase I/Δ° suffixes map to the forms
        # actually present in the suffix table (e.g. "YI" -> "yΔ±", not "yi",
        # which would also make the later token-merge comparison fail).
        sl = _turkish_lower(suffix)
        if sl in TURKISH_SUFFIXES_AFTER_APOSTROPHE:
            splits.append((_turkish_lower(base), sl))
            return f"{base} {suffix}"
        return m.group(0)

    return _APO_RE.sub(_repl, text), splits
|
|
|
|
def _merge_apostrophe_tokens(
    tokens: list[dict], apo_splits: list[tuple[str, str]]
) -> list[dict]:
    """
    For each (foreign_base, suffix) pair recorded during _split_apostrophe,
    find the consecutive BPE/ROOT pieces that together spell foreign_base,
    merge them into one FOREIGN ROOT token, and mark the next word-initial
    token whose stripped form == suffix as SUFFIX.
    """
    if not apo_splits:
        return tokens


    # Work on a copy; the loop below splices this list in place.
    result = list(tokens)


    for foreign_base, suffix in apo_splits:
        n = len(result)
        for j in range(1, n):
            tok_j = result[j]
            # The suffix candidate must start a new word (leading space)...
            if not tok_j["token"].startswith(" "):
                continue
            # ...and spell exactly the recorded suffix.
            if _turkish_lower(tok_j["token"].strip()) != suffix:
                continue


            # Walk backwards to the first piece of the preceding word.
            word_start = j - 1
            while word_start > 0 and not result[word_start]["token"].startswith(" "):
                word_start -= 1


            pieces = result[word_start:j]
            if not pieces:
                continue


            # The pieces must jointly spell the recorded foreign base.
            combined = "".join(_turkish_lower(p["token"].strip()) for p in pieces)
            if combined != foreign_base:
                continue


            # Rebuild the surface form: keep the first piece's leading
            # whitespace, strip the continuation pieces.
            merged = pieces[0]["token"]
            for p in pieces[1:]:
                merged += p["token"].strip()


            new_root = {"token": merged, "type": "ROOT", "_foreign": True}
            new_suf = {**tok_j, "type": "SUFFIX", "_apo_suffix": True}


            # Splice the merged root + suffix in place of the old pieces.
            # Indices shift after this, so handle one merge per recorded
            # pair and restart scanning with the next pair.
            result = (
                result[:word_start]
                + [new_root, new_suf]
                + result[j + 1:]
            )
            break


    return result
|
|
|
|
| |
|
|
def preprocess(text: str) -> tuple[str, set, list]:
    """Prepare text before base tokenization.

    Runs the ALL-CAPS fix first, then the apostrophe split on the result.

    Returns:
        (modified_text, caps_set, apo_splits)
    """
    decapped, caps = _fix_all_caps(text)
    prepared, apo_splits = _split_apostrophe(decapped)
    return prepared, caps, apo_splits
|
|
|
|
def postprocess(
    tokens: list[dict], caps: set, apo_splits: list | None = None
) -> list[dict]:
    """Fix tokens after base tokenization: restore ALL-CAPS markers, then
    merge the apostrophe-split foreign-base/suffix pieces."""
    with_caps = _restore_caps_tokens(tokens, caps)
    return _merge_apostrophe_tokens(with_caps, apo_splits or [])
|
|