# History note (nmstech, commit cfffd93): project renamed from
# TurkTokenizer to NedoTurkishTokenizer.
"""Fix 1: ALL CAPS inflation fix. Fix 2: Apostrophe / code-switching split."""
from __future__ import annotations
import re
from pathlib import Path
TR_CHARS = set("Γ§ΔŸΔ±ΕŸΓΆΓΌΓ‡ΔžΔ°ΕžΓ–Γœ")
_PROPER_NOUNS: set[str] | None = None
def _load_proper_nouns() -> set[str]:
global _PROPER_NOUNS
if _PROPER_NOUNS is not None:
return _PROPER_NOUNS
path = Path(__file__).parent / "data" / "turkish_proper_nouns.txt"
if path.exists():
_PROPER_NOUNS = {
line.strip().lower()
for line in path.read_text(encoding="utf-8").splitlines()
if line.strip() and not line.startswith("#")
}
else:
_PROPER_NOUNS = set()
return _PROPER_NOUNS
def _turkish_lower(s: str) -> str:
"""Turkish-aware lowercase: İ→i, I→ı (not i), then standard lower."""
return s.replace("Δ°", "i").replace("I", "Δ±").lower()
# Suffixes that can legally follow an apostrophe in Turkish writing
# (case endings, possessives, plurals, copula, derivational -li/-siz, ...).
# Sorted longest-first so greedy matching never lets a short suffix ("a")
# shadow a longer one ("lara").
TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
    [
        "nΔ±n","nin","nun","nΓΌn","dan","den","tan","ten",
        "da","de","ta","te","ya","ye","nda","nde",
        "yΔ±","yi","yu","yΓΌ","nΔ±","ni","nu","nΓΌ",
        "lar","ler","lara","lere","larΔ±","leri",
        "Δ±m","im","um","ΓΌm","Δ±n","in","un","ΓΌn",
        "mΔ±z","miz","muz","mΓΌz","nΔ±z","niz","nuz","nΓΌz",
        "dΔ±r","dir","dur","dΓΌr","tΔ±r","tir","tur","tΓΌr",
        "ki","li","lΔ±","lu","lΓΌ","sΔ±z","siz","suz","sΓΌz",
        "a","e","Δ±","i","u","ΓΌ",
    ],
    key=len,
    reverse=True,
)
# BASE'SUFFIX: base = 2+ ASCII/Turkish letters or digits, apostrophe is
# ASCII ' or U+2019 (right single quotation mark), suffix = 1-6 letters
# followed by a word boundary.
_APO_RE = re.compile(
    r"([A-Za-zΓ‡Γ§ΔžΔŸΔ°Δ±Γ–ΓΆΕžΕŸΓœΓΌ0-9]{2,})['\u2019]([A-Za-zΓ‡Γ§ΔžΔŸΔ°Δ±Γ–ΓΆΕžΕŸΓœΓΌ]{1,6})\b"
)
# A whole word of 2+ uppercase letters (ASCII A-Z plus Turkish uppercase).
_CAPS_RE = re.compile(r'\b([A-ZΓ‡ΔžΔ°Γ–ΕžΓœ]{2,})\b')
def _is_turkish_base(word: str) -> bool:
    """Return True if the word should be treated as Turkish (don't split apostrophe)."""
    lowered = _turkish_lower(word)

    # Fast path: Turkish-specific letters are a dead giveaway.
    for ch in lowered:
        if ch in TR_CHARS:
            return True

    # Known Turkish proper nouns (cities, regions) outside the TDK list.
    if lowered in _load_proper_nouns():
        return True

    # TDK dictionary hit β†’ Turkish word (or an accepted loanword).
    from ._tdk_vocab import load_tdk_words  # noqa: PLC0415
    vocab = load_tdk_words()
    if vocab and lowered in vocab:
        return True

    # Zemberek fallback: proper nouns whose lemma carries Turkish chars
    # (Δ°stanbul, Δ°zmir, ...).
    try:
        from ._root_validator import _morphology, ZEMBEREK_AVAILABLE  # noqa: PLC0415
        if ZEMBEREK_AVAILABLE and _morphology:
            for analysis in _morphology.analyze(lowered):
                lemma = str(analysis).split("]")[0].lstrip("[")
                if any(c in TR_CHARS for c in lemma):
                    return True
    except Exception:  # noqa: BLE001
        pass

    # Neither TDK nor Zemberek confirmed it: very short words stay ambiguous,
    # so treat them as Turkish.
    return len(lowered) < 4
# ── Fix 1: ALL CAPS ───────────────────────────────────────────────────────────
def _fix_all_caps(text: str) -> tuple[str, set]:
    """Lowercase every ALL-CAPS word in *text* (Turkish-aware).

    Returns:
        (modified_text, caps) where caps holds the lowercased forms, so
        _restore_caps_tokens can re-mark those words after tokenization.
    """
    caps: set[str] = set()

    def _replace(m: re.Match) -> str:
        # Lower once; the old code called _turkish_lower twice per match.
        lowered = _turkish_lower(m.group(1))
        caps.add(lowered)
        return lowered

    return _CAPS_RE.sub(_replace, text), caps
def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
    """Re-insert <uppercase_word> markers for words lowered by _fix_all_caps.

    Two cases:
      * a ROOT token whose lowered text is in *caps* β†’ prefix the marker;
      * a run of BPE pieces (word-initial " x" plus continuations with no
        leading space) whose joined lowered text is in *caps* β†’ replace the
        run with the marker plus one merged ROOT token.
    """
    result: list[dict] = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        raw_low = _turkish_lower(tok["token"].strip())
        if tok["type"] == "ROOT" and raw_low in caps:
            result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
            result.append(tok)
            i += 1
            continue
        if tok["type"] == "BPE" and tok["token"].startswith(" "):
            combined = raw_low
            j = i + 1
            # Extend with continuation pieces until we hit a caps word, grow
            # too long to plausibly be one, or reach the next word.
            # (Fix: dropped the unused `lookahead` accumulator.)
            while j < len(tokens):
                nt = tokens[j]
                if nt["token"].startswith(" "):
                    break  # next word begins
                combined += _turkish_lower(nt["token"].strip())
                j += 1
                if combined in caps:
                    break
                if len(combined) > 8:
                    break  # give up: too long for a caps word here
            if combined in caps:
                result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
                result.append({"token": f" {combined}", "type": "ROOT",
                               "_acronym": True, "_caps": True})
                i = j
                continue
        result.append(tok)
        i += 1
    return result
# ── Fix 2: Apostrophe split ───────────────────────────────────────────────────
#
# Strategy: record (foreign_base, suffix) pairs, replace apostrophe with space.
# After tokenization, _merge_apostrophe_tokens uses these pairs to find the
# BPE pieces that form the foreign word and merge them into one FOREIGN ROOT,
# then marks the following word-initial suffix token as SUFFIX.
#
# Old approach used a \ue001 separator β€” the base tokenizer converts that to
# '<unknown>' so the separator was never found. Simple-space + pair-list is
# robust regardless of how the tokenizer handles the input.
def _split_apostrophe(text: str) -> tuple[str, list[tuple[str, str]]]:
    """
    Replace FOREIGN'SUFFIX with 'FOREIGN SUFFIX' (apostrophe β†’ space).

    Returns:
        (modified_text, [(foreign_base_lower, suffix_lower), ...]).
    Turkish proper names (Δ°stanbul'da) are left unchanged.
    """
    splits: list[tuple[str, str]] = []

    def _repl(m: re.Match) -> str:
        base, suffix = m.group(1), m.group(2)
        if _is_turkish_base(base):
            return m.group(0)  # leave Turkish names alone
        # Fix: Turkish-aware lowering ("Δ°N" β†’ "in", "NIN" β†’ "nΔ±n"). Plain
        # str.lower() produced "iΜ‡n"/"nin", which never matched the
        # _turkish_lower comparison in _merge_apostrophe_tokens.
        sl = _turkish_lower(suffix)
        if sl in TURKISH_SUFFIXES_AFTER_APOSTROPHE:
            splits.append((_turkish_lower(base), sl))
            return f"{base} {suffix}"  # just drop the apostrophe
        return m.group(0)

    return _APO_RE.sub(_repl, text), splits
def _merge_apostrophe_tokens(
    tokens: list[dict], apo_splits: list[tuple[str, str]]
) -> list[dict]:
    """
    For each (foreign_base, suffix) pair recorded during _split_apostrophe,
    find the consecutive BPE/ROOT pieces that together spell foreign_base,
    merge them into one FOREIGN ROOT token, and mark the next word-initial
    token whose stripped form == suffix as SUFFIX.
    """
    if not apo_splits:
        return tokens
    result = list(tokens)
    # Pairs were recorded left-to-right by the regex sub, so resume each
    # search after the previous match. Fix: the old code restarted from the
    # beginning for every pair, binding duplicate (base, suffix) pairs to the
    # same first occurrence and leaving later occurrences untouched.
    search_from = 1
    for foreign_base, suffix in apo_splits:
        for j in range(search_from, len(result)):
            tok_j = result[j]
            # Candidate suffix token: word-initial, stripped == suffix
            if not tok_j["token"].startswith(" "):
                continue
            if _turkish_lower(tok_j["token"].strip()) != suffix:
                continue
            # Walk back to find pieces of the word before j (no leading space)
            word_start = j - 1
            while word_start > 0 and not result[word_start]["token"].startswith(" "):
                word_start -= 1
            pieces = result[word_start:j]
            if not pieces:
                continue
            combined = "".join(_turkish_lower(p["token"].strip()) for p in pieces)
            if combined != foreign_base:
                continue
            # Merge pieces into one FOREIGN ROOT (first piece keeps its space)
            merged = pieces[0]["token"] + "".join(p["token"].strip() for p in pieces[1:])
            new_root = {"token": merged, "type": "ROOT", "_foreign": True}
            new_suf = {**tok_j, "type": "SUFFIX", "_apo_suffix": True}
            result = (
                result[:word_start]
                + [new_root, new_suf]
                + result[j + 1:]
            )
            search_from = word_start + 2  # continue after the merged pair
            break  # this pair is handled
    return result
# ── Combined pre / post ───────────────────────────────────────────────────────
def preprocess(text: str) -> tuple[str, set, list]:
    """Prepare text before base tokenization.

    Returns:
        (modified_text, caps_set, apo_splits)
    """
    # Order matters: ALL-CAPS words are lowered first, so apostrophe handling
    # sees already-normalized case.
    lowered_text, caps_set = _fix_all_caps(text)
    final_text, apo_splits = _split_apostrophe(lowered_text)
    return final_text, caps_set, apo_splits
def postprocess(
    tokens: list[dict], caps: set, apo_splits: list | None = None
) -> list[dict]:
    """Fix tokens after base tokenization."""
    restored = _restore_caps_tokens(tokens, caps)
    pairs = apo_splits if apo_splits else []
    return _merge_apostrophe_tokens(restored, pairs)