Rename project from TurkTokenizer to NedoTurkishTokenizer

cfffd93 18 days ago

10.7 kB

	"""Fix 8: Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI).

	Uses a segment-based approach: special tokens are detected and extracted
	before the base tokenizer runs, so they never pass through it.
	"""

	from __future__ import annotations

	import re

	# Lower-case month names used by the post-pass to tag month tokens.
	MONTH_NAMES = {
	    # Turkish
	    "ocak", "şubat", "mart", "nisan", "mayıs", "haziran",
	    "temmuz", "ağustos", "eylül", "ekim", "kasım", "aralık",
	    # English
	    "january", "february", "march", "april", "may", "june",
	    "july", "august", "september", "october", "november", "december",
	}

	# Lower-case unit abbreviations; a token equal to one of these is tagged UNIT.
	UNITS = {
	    # length
	    "km", "m", "cm", "mm", "nm",
	    # mass
	    "kg", "g", "mg", "ton",
	    # time (includes Turkish abbreviations such as "sn", "dk", "sa")
	    "sn", "dk", "sa", "ms",
	    # currency codes
	    "tl", "usd", "eur", "gbp",
	    # data size
	    "kb", "mb", "gb", "tb", "pb",
	    # dosage / clinical measurements
	    "ml", "mcg", "meq", "iu", "mmhg", "mosm",
	    # frequency, power, energy
	    "hz", "mhz", "ghz", "watt", "kw", "mw", "kcal", "cal",
	}

	# Lower-case Roman numerals recognised as numbers by the post-pass.
	# NOTE(review): "v" and "x" are absent while "i" is present — confirm
	# whether that omission is intentional.
	ROMAN_NUMERALS = {
	    "i", "ii", "iii", "iv", "vi", "vii", "viii", "ix",
	    "xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx",
	}

	# ── Regex patterns ────────────────────────────────────────────────────────────

	# NOTE: alternation in Python's ``re`` is a bare ``|``.  An escaped ``\|``
	# matches a literal pipe character, which silently disables every branch
	# of the pattern, so all alternations below use the unescaped operator.

	# http(s):// links or bare "www." links, up to the next whitespace.
	URL_RE = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
	# "@name" / "#tag"; the explicit range covers Latin-1 Supplement and
	# Latin Extended-A/B letters.
	MENTION_RE = re.compile(r'@[\w\u00C0-\u024F]+')
	HASHTAG_RE = re.compile(r'#[\w\u00C0-\u024F]+')

	# Turkish suffixes that can follow a number+apostrophe.
	# Sorted longest-first so that both the regex alternation and the greedy
	# prefix matching done elsewhere in this module try long suffixes first.
	_NUM_SUFFIXES = sorted(
	    [
	        "nın", "nin", "nun", "nün", "dan", "den", "tan", "ten",
	        "da", "de", "ta", "te", "ya", "ye", "nda", "nde",
	        "yı", "yi", "yu", "yü", "nı", "ni", "nu", "nü",
	        "lar", "ler", "lara", "lere", "ları", "leri",
	        "ım", "im", "um", "üm", "ın", "in", "un", "ün",
	        "mız", "miz", "muz", "müz", "nız", "niz", "nuz", "nüz",
	        "dır", "dir", "dur", "dür", "tır", "tir", "tur", "tür",
	        "ki", "li", "lı", "lu", "lü", "sız", "siz", "suz", "süz",
	        "inci", "ıncı", "uncu", "üncü", "nci", "ncı",
	        "lık", "lik", "luk", "lük",
	        "a", "e", "ı", "i", "u", "ü",
	    ],
	    key=len,
	    reverse=True,
	)

	# Regex alternation of every suffix above (each one escaped).
	_SUFFIX_ALT = '|'.join(re.escape(s) for s in _NUM_SUFFIXES)

	# Number (or time) followed by apostrophe + Turkish suffix(es): 3'te, 1990'larda
	NUM_APOSTROPHE_RE = re.compile(
	    r"\d+(?:[.:,]\d+)*['\u2019](?:" + _SUFFIX_ALT + r")+\b",
	    re.IGNORECASE,
	)

	# Day-first (15.01.2024, 1/2/24) or year-first (2024-01-15) dates.
	DATE_RE = re.compile(
	    r'\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}'
	    r'|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}'
	)
	# Currency symbol before or after an amount.  The fractional digits are
	# optional (``\d*``) so that single-digit amounts such as "$5" match.
	CURRENCY_RE = re.compile(r'[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]')
	NUMBER_RE = re.compile(
	    r'%\d+[\.,]?\d*'            # leading percent sign (%50)
	    r'|\d{1,3}(?:\.\d{3})+'     # thousands (1.000.000) — before decimal!
	    r'|\d+[\.,]\d+'             # decimal (2.5, 10,5)
	    r'|\d+%'                    # trailing percent sign (50%)
	    r'|\d+/\d+'                 # fraction (3/4)
	)
	TIME_RE = re.compile(r'\d{1,2}:\d{2}(?::\d{2})?')
	PLAIN_NUM_RE = re.compile(r'\b\d+\b')

	# ── Acronym patterns ─────────────────────────────────────────────────────────
	# Matches standalone uppercase sequences (+ optional trailing digits).
	# [A-Z]{2,}[0-9]* → HTML, GPT, CSS3, HTML5, MP3
	# [A-Z][0-9]+ → F16, H264, A4
	# Does NOT match mixed-case words (ChatGPT) because \b won't fire mid-word.
	ACRONYM_RE = re.compile(
	    r"\b[A-ZÇĞİÖŞÜ]{2,}[0-9]*\b"
	    r"|\b[A-ZÇĞİÖŞÜ][0-9]+\b"
	)

	# Acronym followed by apostrophe + Turkish suffix(es): NATO'nun, HTML5'ten
	# (case-sensitive: the suffix part must be lower-case).
	ACRONYM_APOSTROPHE_RE = re.compile(
	    r"\b(?:[A-ZÇĞİÖŞÜ]{2,}[0-9]*|[A-ZÇĞİÖŞÜ][0-9]+)['\u2019](?:"
	    + _SUFFIX_ALT + r")+\b"
	)

	# ASCII emoticons such as :) ;-D =P, plus the "<3" heart.
	TEXT_EMOJI_RE = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
	UNICODE_EMOJI_RE = re.compile(
	    "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
	    "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
	    "\U00002700-\U000027BF\U0001F900-\U0001F9FF"
	    "\U00002600-\U000026FF]+",
	    flags=re.UNICODE,
	)

	# (pattern, token type) pairs scanned by find_special_spans().
	# That function keeps the earliest-starting span and, for equal starts, the
	# longest one; list order here only decides between candidates that have
	# the same start AND the same length (the sort is stable, so the earlier
	# entry wins such ties).
	_SPAN_PATTERNS: list[tuple[re.Pattern, str]] = [
	    (URL_RE, "URL"),
	    (MENTION_RE, "MENTION"),
	    (HASHTAG_RE, "HASHTAG"),
	    (DATE_RE, "DATE"),
	    (CURRENCY_RE, "UNIT"),
	    (NUM_APOSTROPHE_RE, "NUM_APO"),
	    (ACRONYM_APOSTROPHE_RE, "ACRONYM_APO"),
	    (ACRONYM_RE, "ACRONYM"),
	    (NUMBER_RE, "NUM"),
	    (TIME_RE, "NUM"),
	    (PLAIN_NUM_RE, "NUM"),
	    (UNICODE_EMOJI_RE, "EMOJI"),
	    (TEXT_EMOJI_RE, "EMOJI"),
	]


	# ── Acronym vs Turkish word disambiguation ───────────────────────────────────

	def _is_known_turkish_word(word_upper: str) -> bool:
	    """Decide whether an ALL-CAPS token is an ordinary Turkish word.

	    Lookup order:

	    1. ``word_upper`` is a key of ``ACRONYM_EXPANSIONS`` → acronym (``False``).
	    2. ``word_upper`` with trailing digits removed (HTML5 → HTML) is a key of
	       ``ACRONYM_EXPANSIONS`` → acronym (``False``).
	    3. The Turkish-lowercased form is in the TDK word list → word (``True``).
	    4. The Turkish-lowercased form is in the proper-noun list → word (``True``).
	    5. Anything else is treated as an acronym (``False``).
	    """
	    # Imported at call time rather than at module import time.
	    from ._acronym_dict import ACRONYM_EXPANSIONS  # noqa: PLC0415
	    from ._preprocessor import _turkish_lower, _load_proper_nouns  # noqa: PLC0415
	    from ._tdk_vocab import load_tdk_words  # noqa: PLC0415

	    # Forms to look up in the acronym table: the token itself and, when it
	    # differs and is non-empty, the token without its trailing digits.
	    stem = word_upper.rstrip("0123456789")
	    acronym_keys = [word_upper]
	    if stem and stem != word_upper:
	        acronym_keys.append(stem)
	    if any(key in ACRONYM_EXPANSIONS for key in acronym_keys):
	        return False

	    lowered = _turkish_lower(word_upper)

	    # A hit in the dictionary means this is a real word, not an acronym.
	    # An empty/missing word list is skipped.
	    vocabulary = load_tdk_words()
	    if vocabulary and lowered in vocabulary:
	        return True

	    # Last resort: proper nouns (İstanbul, Ankara…).
	    return lowered in _load_proper_nouns()


	# ── Segment-based API ────────────────────────────────────────────────────────

	def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
	    """Locate every special-token span in *text*.

	    Every pattern in ``_SPAN_PATTERNS`` is run over the whole text; acronym
	    matches whose upper-case base is a known Turkish word are discarded.
	    Overlaps are then resolved left to right.

	    Returns:
	        A list of ``(start, end, token_type, original_text)`` tuples, sorted
	        by ``start`` and non-overlapping.
	    """
	    acronym_kinds = ("ACRONYM", "ACRONYM_APO")
	    found: list[tuple[int, int, str, str]] = []

	    for regex, kind in _SPAN_PATTERNS:
	        for match in regex.finditer(text):
	            matched_text = match.group(0)

	            if kind in acronym_kinds:
	                # For the apostrophe form only the part before the
	                # apostrophe (ASCII first, then U+2019) is the acronym.
	                head = matched_text
	                if kind == "ACRONYM_APO":
	                    cut = matched_text.find("'")
	                    if cut == -1:
	                        cut = matched_text.find("\u2019")
	                    head = matched_text[:cut]
	                # An all-caps ordinary word is left to the base tokenizer.
	                if _is_known_turkish_word(head):
	                    continue

	            found.append((match.start(), match.end(), kind, matched_text))

	    # Earliest start first; among equal starts the longest span first
	    # (start - end is the negated length).
	    found.sort(key=lambda span: (span[0], span[0] - span[1]))

	    # Walk left to right, keeping a span only if it begins at or after the
	    # end of the last span kept.
	    selected: list[tuple[int, int, str, str]] = []
	    cursor = 0
	    for span in found:
	        if span[0] < cursor:
	            continue
	        selected.append(span)
	        cursor = span[1]
	    return selected


	def _split_apostrophe_suffixes(suffix_str: str) -> list[dict]:
	    """Break the text that follows an apostrophe into SUFFIX token dicts.

	    The string is lower-cased and consumed from the left: at each step the
	    first entry of ``_NUM_SUFFIXES`` that prefixes the remainder is emitted.
	    If no entry matches, whatever is left is emitted as one final token.
	    """
	    pieces: list[dict] = []
	    rest = suffix_str.lower()
	    while rest:
	        # Fall back to the whole remainder when no known suffix matches;
	        # consuming it below then empties ``rest`` and ends the loop.
	        piece = next(
	            (cand for cand in _NUM_SUFFIXES if rest.startswith(cand)),
	            rest,
	        )
	        pieces.append({"token": piece, "type": "SUFFIX", "_apo_suffix": True})
	        rest = rest[len(piece):]
	    return pieces


	def _split_at_apostrophe(text: str) -> tuple[str, str]:
	    """Split *text* at its first apostrophe into ``(base, suffix)``.

	    The ASCII apostrophe is looked for first, then U+2019.  When neither is
	    present the whole text is the base and the suffix is empty.
	    """
	    pos = text.find("'")
	    if pos == -1:
	        pos = text.find("\u2019")
	    if pos == -1:
	        # Without this guard the -1 would be used as a slice index, chopping
	        # the last character off the base and duplicating the text as suffix.
	        return text, ""
	    return text[:pos], text[pos + 1:]


	def make_special_tokens(span_type: str, original: str) -> list[dict]:
	    """Create token dict(s) for a matched special span.

	    Args:
	        span_type: Span label, e.g. ``"NUM"``, ``"URL"``, ``"NUM_APO"``.
	        original: The matched source text.

	    Returns:
	        A list of token dicts.  ``NUM_APO`` and ``ACRONYM_APO`` spans yield a
	        base token (type ``NUM`` / ``ACRONYM``) followed by one SUFFIX token
	        per suffix; if *original* contains no apostrophe only the base token
	        is returned.  Every other span type yields exactly one token.  Base
	        and single tokens carry a leading space in ``"token"``.
	    """
	    # span type → (type of the base token, marker key set on it)
	    apostrophe_kinds = {
	        "NUM_APO": ("NUM", "_num"),              # 3'te, 1990'larda
	        "ACRONYM_APO": ("ACRONYM", "_acronym"),  # NATO'nun, HTML5'ten
	    }

	    if span_type in apostrophe_kinds:
	        base_type, marker = apostrophe_kinds[span_type]
	        base, suffix = _split_at_apostrophe(original)
	        tokens: list[dict] = [
	            {"token": f" {base}", "type": base_type, marker: True},
	        ]
	        if suffix:
	            tokens.extend(_split_apostrophe_suffixes(suffix))
	        return tokens

	    # ── Plain acronym (HTML5, GPT) ──────────────────────────────────────
	    if span_type == "ACRONYM":
	        return [{"token": f" {original}", "type": "ACRONYM", "_acronym": True}]

	    # ── Everything else (NUM, DATE, URL, MENTION, HASHTAG, EMOJI, UNIT) ──
	    # The marker key is the lower-cased span type with a leading underscore.
	    return [{
	        "token": f" {original}",
	        "type": span_type,
	        f"_{span_type.lower()}": True,
	    }]


	# ── Safety-net post-pass ─────────────────────────────────────────────────────

	def reclassify_numbers_in_tokens(tokens: list[dict]) -> list[dict]:
	    """Re-tag number, unit, Roman-numeral and month tokens after tokenization.

	    Only tokens typed ``"BPE"`` or ``"ROOT"`` are inspected; all others pass
	    through unchanged.  A token whose stripped text fully matches
	    ``NUMBER_RE`` becomes ``NUM``.  The unit, Roman-numeral and month checks
	    apply to ``"BPE"`` tokens only and compare the lower-cased text.

	    Returns:
	        A new list; re-tagged tokens are shallow copies with the extra keys,
	        untouched tokens are the original dict objects.
	    """
	    out: list[dict] = []
	    for token in tokens:
	        kind = token["type"]
	        if kind not in ("BPE", "ROOT"):
	            out.append(token)
	            continue

	        surface = token["token"].strip()
	        folded = surface.lower()
	        is_bpe = kind == "BPE"

	        # First matching rule wins; ``patch`` holds the keys to overlay.
	        if NUMBER_RE.fullmatch(surface):
	            patch = {"type": "NUM", "_num": True}
	        elif is_bpe and folded in UNITS:
	            patch = {"type": "UNIT", "_unit": True}
	        elif is_bpe and folded in ROMAN_NUMERALS:
	            patch = {"type": "NUM", "_roman": True}
	        elif is_bpe and folded in MONTH_NAMES:
	            # Months are promoted to ROOT rather than to a number type.
	            patch = {"type": "ROOT", "_month": True}
	        else:
	            patch = None

	        out.append(token if patch is None else {**token, **patch})

	    return out