# Hosting-page residue kept as a comment so that the module parses:
# commit cfffd93 (nmstech) - "Rename project from TurkTokenizer to NedoTurkishTokenizer"
"""Fix 8: Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI).
Uses a segment-based approach: special tokens are detected and extracted
*before* the base tokenizer runs, so they never pass through it.
"""
from __future__ import annotations
import re
# Lower-case month names: the twelve Turkish names followed by the twelve
# English ones.
MONTH_NAMES = set(
    (
        "ocak şubat mart nisan mayıs haziran "
        "temmuz ağustos eylül ekim kasım aralık "
        "january february march april may june "
        "july august september october november december"
    ).split()
)
# Lower-case unit abbreviations.  The grouping below is only a reading aid;
# the result is one flat set.
UNITS = {
    unit
    for group in (
        ("km", "m", "cm", "mm", "nm"),                # length
        ("kg", "g", "mg", "ton"),                     # mass
        ("sn", "dk", "sa", "ms"),                     # time (Turkish abbreviations + ms)
        ("tl", "usd", "eur", "gbp"),                  # currency codes
        ("kb", "mb", "gb", "tb", "pb"),               # data size
        ("ml", "mcg", "meq", "iu", "mmhg", "mosm"),   # dosage / clinical units
        ("hz", "mhz", "ghz", "watt", "kw", "mw", "kcal", "cal"),  # frequency, power, energy
    )
    for unit in group
}
# Lower-case Roman numerals between 1 and 20.  The single letters "v" (5) and
# "x" (10) are not part of the set.
# NOTE(review): presumably left out so that ordinary one-letter tokens are not
# tagged as numbers (although "i" is listed) - confirm this is intentional.
ROMAN_NUMERALS = set(
    (
        "i ii iii iv vi vii viii ix "
        "xi xii xiii xiv xv xvi xvii xviii xix xx"
    ).split()
)
# ── Regex patterns ────────────────────────────────────────────────────────────
# URL: an "http://" / "https://" scheme or a bare "www." prefix, followed by
# every non-whitespace character up to the next whitespace (so trailing
# punctuation is part of the match).  Case-insensitive.
URL_RE = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
# "@" followed by one or more word characters; the explicit U+00C0-U+024F
# range spells out the Latin-1 Supplement / Latin Extended-A/B code points.
MENTION_RE = re.compile(r'@[\w\u00C0-\u024F]+')
# Same shape as MENTION_RE, introduced by "#" instead of "@".
HASHTAG_RE = re.compile(r'#[\w\u00C0-\u024F]+')
# Turkish suffixes that can follow a number+apostrophe.
#
# The literals must contain the real Turkish letters (ı, ü); mis-decoded
# variants of them silently stop every ı/ü suffix from being recognised.
# The list is sorted longest-first (stable, so equal-length entries keep the
# order written here) so that both the regex alternation and prefix-based
# splitting try "lara" before "lar" before "a".
_NUM_SUFFIXES = sorted(
    [
        "nın", "nin", "nun", "nün", "dan", "den", "tan", "ten",
        "da", "de", "ta", "te", "ya", "ye", "nda", "nde",
        "yı", "yi", "yu", "yü", "nı", "ni", "nu", "nü",
        "lar", "ler", "lara", "lere", "ları", "leri",
        "ım", "im", "um", "üm", "ın", "in", "un", "ün",
        "mız", "miz", "muz", "müz", "nız", "niz", "nuz", "nüz",
        "dır", "dir", "dur", "dür", "tır", "tir", "tur", "tür",
        "ki", "li", "lı", "lu", "lü", "sız", "siz", "suz", "süz",
        "inci", "ıncı", "uncu", "üncü", "nci", "ncı",
        "lık", "lik", "luk", "lük",
        "a", "e", "ı", "i", "u", "ü",
    ],
    key=len,
    reverse=True,
)
# Regex alternation over all suffixes, in the same longest-first order.
_SUFFIX_ALT = '|'.join(re.escape(s) for s in _NUM_SUFFIXES)
# Number (or time) followed by apostrophe + Turkish suffix(es).
#   \d+(?:[.:,]\d+)*   digits, optionally continued by "." / ":" / "," groups
#   ['\u2019]          ASCII apostrophe or U+2019 right single quotation mark
#   (?:suffix)+\b      one or more suffixes, ending at a word boundary
NUM_APOSTROPHE_RE = re.compile(
    r"\d+(?:[.:,]\d+)*['\u2019](?:" + _SUFFIX_ALT + r")+\b",
    re.IGNORECASE,
)
# Numeric dates whose three fields are separated by ".", "/" or "-":
#   1-2 digits, 1-2 digits, 2-4 digits   (e.g. 05.03.2024, 5/3/24)
#   4 digits,   1-2 digits, 1-2 digits   (e.g. 2024-03-05)
# The two separators need not be the same character and the field values are
# not range-checked.
DATE_RE = re.compile(
    r'\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}'
    r'|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}'
)
# Amount with a currency symbol directly before or after it ($5, 9,99€, ₺100).
# The symbol class holds exactly six characters: $ € £ ¥ ₺ ₽.  They must be
# stored as real characters; mis-decoded bytes would add unrelated characters
# to the class and drop the yen, lira and rouble signs.
CURRENCY_RE = re.compile(r'[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]')
# Composite numeric forms; the alternatives are tried in the order written.
NUMBER_RE = re.compile(
    r'%\d+[\.,]?\d*'  # leading percent sign (%50, %12,5)
    r'|\d{1,3}(?:\.\d{3})+' # thousands (1.000.000) — before decimal!
    r'|\d+[\.,]\d+' # decimal (2.5, 10,5)
    r'|\d+%'  # trailing percent sign (50%)
    r'|\d+/\d+'  # two integers joined by a slash (3/4)
)
# Clock times: H:MM or HH:MM with an optional :SS part (values are not
# range-checked).
TIME_RE = re.compile(r'\d{1,2}:\d{2}(?::\d{2})?')
# A run of digits with a word boundary on both sides.
PLAIN_NUM_RE = re.compile(r'\b\d+\b')
# ── Acronym patterns ─────────────────────────────────────────────────────────
# Matches standalone uppercase sequences (+ optional trailing digits).
#   [A-Z]{2,}[0-9]*  → HTML, GPT, CSS3, HTML5, MP3
#   [A-Z][0-9]+      → F16, H264, A4
# The letter class is A-Z plus the Turkish capitals Ç Ğ İ Ö Ş Ü (TÜİK, İTÜ).
# These must be stored as real characters: mis-decoded bytes would make the
# class accept unrelated Greek letters and symbols and reject Turkish capitals.
# Does NOT match mixed-case words (ChatGPT) because \b won't fire mid-word.
ACRONYM_RE = re.compile(
    r"\b[A-ZÇĞİÖŞÜ]{2,}[0-9]*\b"
    r"|\b[A-ZÇĞİÖŞÜ][0-9]+\b"
)
# Acronym followed by apostrophe + Turkish suffix(es): NATO'nun, HTML5'ten
# The base uses the acronym letter class (A-Z plus the Turkish capitals
# Ç Ğ İ Ö Ş Ü, stored as real characters rather than mis-decoded bytes).
# The apostrophe may be ASCII ' or U+2019.  No IGNORECASE flag here, so the
# suffixes only match in the lower-case form listed in _SUFFIX_ALT.
ACRONYM_APOSTROPHE_RE = re.compile(
    r"\b(?:[A-ZÇĞİÖŞÜ]{2,}[0-9]*|[A-ZÇĞİÖŞÜ][0-9]+)['\u2019](?:"
    + _SUFFIX_ALT + r")+\b"
)
# ASCII emoticons: eyes (":", ";" or "="), an optional "-" nose, then one mouth
# character out of ) ( ] [ d D p P o O 3 -- or the literal heart "<3".
TEXT_EMOJI_RE = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
# One or more consecutive code points drawn from the Unicode ranges below.
UNICODE_EMOJI_RE = re.compile(
    "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"  # Emoticons; Misc Symbols and Pictographs
    "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"  # Transport and Map; range holding the regional-indicator (flag) letters
    "\U00002700-\U000027BF\U0001F900-\U0001F9FF"  # Dingbats; Supplemental Symbols and Pictographs
    "\U00002600-\U000026FF]+",  # Miscellaneous Symbols
    flags=re.UNICODE,
)
# Patterns scanned by find_special_spans, each paired with the span type it
# yields.
# NOTE: list order is only a tie-breaker.  find_special_spans sorts candidates
# by start offset and then by decreasing length, so an earlier entry wins only
# when two patterns match exactly the same span (the sort is stable).
# CURRENCY_RE spans are labelled "UNIT"; NUMBER_RE, TIME_RE and PLAIN_NUM_RE
# all yield "NUM".
_SPAN_PATTERNS: list[tuple[re.Pattern, str]] = [
    (URL_RE, "URL"),
    (MENTION_RE, "MENTION"),
    (HASHTAG_RE, "HASHTAG"),
    (DATE_RE, "DATE"),
    (CURRENCY_RE, "UNIT"),
    (NUM_APOSTROPHE_RE, "NUM_APO"),
    (ACRONYM_APOSTROPHE_RE, "ACRONYM_APO"),
    (ACRONYM_RE, "ACRONYM"),
    (NUMBER_RE, "NUM"),
    (TIME_RE, "NUM"),
    (PLAIN_NUM_RE, "NUM"),
    (UNICODE_EMOJI_RE, "EMOJI"),
    (TEXT_EMOJI_RE, "EMOJI"),
]
# ── Acronym vs Turkish word disambiguation ───────────────────────────────────
def _is_known_turkish_word(word_upper: str) -> bool:
    """Decide whether an ALL-CAPS candidate is a Turkish word, not an acronym.

    The lookups run in this order and the first hit decides:

    1. *word_upper* is a key of ``ACRONYM_EXPANSIONS``      -> ``False``
    2. *word_upper* minus its trailing digits is such a key
       (``HTML5`` -> ``HTML``)                              -> ``False``
    3. its Turkish-lowercased form is in the TDK word list  -> ``True``
    4. its Turkish-lowercased form is in the proper nouns   -> ``True``
    5. nothing matched, so it is treated as an acronym      -> ``False``

    Args:
        word_upper: The candidate text exactly as matched by the acronym regex.

    Returns:
        ``True`` when the candidate should NOT be tagged as an acronym.
    """
    # Imported lazily inside the function, as in the rest of this module.
    from ._acronym_dict import ACRONYM_EXPANSIONS  # noqa: PLC0415
    from ._preprocessor import _turkish_lower, _load_proper_nouns  # noqa: PLC0415
    from ._tdk_vocab import load_tdk_words  # noqa: PLC0415

    # Known acronyms always win, with or without a trailing version number.
    acronym_keys = [word_upper]
    digitless = word_upper.rstrip("0123456789")
    if digitless and digitless != word_upper:
        acronym_keys.append(digitless)
    if any(key in ACRONYM_EXPANSIONS for key in acronym_keys):
        return False

    lowered = _turkish_lower(word_upper)

    # A real dictionary word is not an acronym.  An empty / missing word list
    # simply skips this check.
    tdk_words = load_tdk_words()
    if tdk_words and lowered in tdk_words:
        return True

    # Proper nouns (İstanbul, Ankara, ...) are ordinary words as well.
    return lowered in _load_proper_nouns()
# ── Segment-based API ────────────────────────────────────────────────────────
def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
    """Locate every special-token span in *text*.

    Each pattern in ``_SPAN_PATTERNS`` is scanned over the whole text.
    Acronym matches (with or without an apostrophe suffix) are discarded when
    ``_is_known_turkish_word`` recognises their uppercase base.  The surviving
    candidates are ordered by start offset, longest first, and picked greedily
    from left to right, so the result never contains overlapping spans.

    Args:
        text: The raw input string.

    Returns:
        ``(start, end, token_type, original_text)`` tuples sorted by ``start``.
    """
    found: list[tuple[int, int, str, str]] = []
    for regex, kind in _SPAN_PATTERNS:
        needs_acronym_check = kind in ("ACRONYM", "ACRONYM_APO")
        for match in regex.finditer(text):
            matched_text = match.group(0)
            if needs_acronym_check:
                base = matched_text
                if kind == "ACRONYM_APO":
                    # Keep only the part before the apostrophe
                    # (ASCII ' first, otherwise U+2019).
                    cut = matched_text.find("'")
                    if cut == -1:
                        cut = matched_text.find("\u2019")
                    base = matched_text[:cut]
                if _is_known_turkish_word(base):
                    continue
            found.append((match.start(), match.end(), kind, matched_text))

    # Earliest start first; among equal starts the longest span first.  The
    # sort is stable, so identical spans stay in pattern order.
    found.sort(key=lambda span: (span[0], span[0] - span[1]))

    # Greedy sweep: accept a span only if it begins at or after the end of the
    # last accepted one.
    selected: list[tuple[int, int, str, str]] = []
    cursor = 0
    for span in found:
        if span[0] >= cursor:
            selected.append(span)
            cursor = span[1]
    return selected
def _split_apostrophe_suffixes(suffix_str: str) -> list[dict]:
    """Break the text after an apostrophe into individual SUFFIX tokens.

    The string is lower-cased and consumed from the left: at every step the
    first entry of ``_NUM_SUFFIXES`` that prefixes the remainder is peeled off.
    If no entry fits, the whole remainder becomes one final token.

    Args:
        suffix_str: Text that followed the apostrophe (may be empty).

    Returns:
        A list of ``{"token", "type": "SUFFIX", "_apo_suffix": True}`` dicts;
        empty when *suffix_str* is empty.
    """
    pieces: list[dict] = []
    rest = suffix_str.lower()
    while rest:
        hit = next((cand for cand in _NUM_SUFFIXES if rest.startswith(cand)), None)
        if hit is None:
            # Unknown tail: keep it as a single token and stop.
            pieces.append({"token": rest, "type": "SUFFIX", "_apo_suffix": True})
            break
        pieces.append({"token": hit, "type": "SUFFIX", "_apo_suffix": True})
        rest = rest[len(hit):]
    return pieces
def _split_at_apostrophe(original: str) -> tuple[str, str]:
    """Split *original* into ``(base, suffix)`` around its apostrophe.

    An ASCII ``'`` is looked for first, then U+2019.  When neither is present
    the whole string is the base and the suffix is empty.
    """
    apo_pos = original.find("'")
    if apo_pos == -1:
        apo_pos = original.find("\u2019")
    if apo_pos == -1:
        return original, ""
    return original[:apo_pos], original[apo_pos + 1:]


def make_special_tokens(span_type: str, original: str) -> list[dict]:
    """Create token dict(s) for a matched special span.

    ``NUM_APO`` and ``ACRONYM_APO`` spans are split into a base token followed
    by one SUFFIX token per suffix (3'te, NATO'nun).  Every other span type
    becomes a single token whose flag key is ``_<span_type lower-cased>``.

    Args:
        span_type: Type label of the span (``"NUM_APO"``, ``"URL"``, ...).
        original: The exact text of the span.

    Returns:
        A list with at least one token dict.  Base tokens carry a leading
        space in their ``"token"`` value.  An ``*_APO`` span that contains no
        apostrophe yields just the base token, built from the whole text.
    """
    # ── Base + apostrophe + suffix (3'te, 1990'larda, NATO'nun, HTML5'ten) ──
    if span_type in ("NUM_APO", "ACRONYM_APO"):
        base, suffix = _split_at_apostrophe(original)
        if span_type == "NUM_APO":
            head = {"token": f" {base}", "type": "NUM", "_num": True}
        else:
            head = {"token": f" {base}", "type": "ACRONYM", "_acronym": True}
        # Nothing to split when there is no suffix text.
        tail = _split_apostrophe_suffixes(suffix) if suffix else []
        return [head, *tail]

    # ── Plain acronym (HTML5, GPT) ──────────────────────────────────────
    if span_type == "ACRONYM":
        return [{"token": f" {original}", "type": "ACRONYM", "_acronym": True}]

    # ── Everything else (NUM, DATE, URL, MENTION, HASHTAG, EMOJI, UNIT) ──
    return [{
        "token": f" {original}",
        "type": span_type,
        f"_{span_type.lower()}": True,
    }]
# ── Safety-net post-pass ─────────────────────────────────────────────────────
def reclassify_numbers_in_tokens(tokens: list[dict]) -> list[dict]:
    """Re-tag number, unit, Roman-numeral and month tokens left as BPE/ROOT.

    Only tokens whose ``"type"`` is ``"BPE"`` or ``"ROOT"`` are inspected;
    everything else is passed through as the same object.  The stripped token
    text is tested in this order:

    * full match of ``NUMBER_RE``                -> ``NUM``  + ``_num``
    * (BPE only) lower-cased text in ``UNITS``   -> ``UNIT`` + ``_unit``
    * (BPE only) ... in ``ROMAN_NUMERALS``       -> ``NUM``  + ``_roman``
    * (BPE only) ... in ``MONTH_NAMES``          -> ``ROOT`` + ``_month``

    Note that ``NUMBER_RE`` has no alternative for a bare integer such as
    ``"42"``, so those tokens are left untouched here.

    Args:
        tokens: Token dicts with at least ``"token"`` and ``"type"`` keys.

    Returns:
        A new list; re-tagged tokens are shallow copies, the rest are the
        original dict objects.
    """
    out: list[dict] = []
    for token in tokens:
        kind = token["type"]
        if kind not in ("BPE", "ROOT"):
            out.append(token)
            continue

        text = token["token"].strip()
        if NUMBER_RE.fullmatch(text):
            out.append({**token, "type": "NUM", "_num": True})
            continue

        # The lexicon look-ups apply to BPE pieces only; ROOT tokens that are
        # not numeric stay as they are.
        patch = None
        if kind == "BPE":
            key = text.lower()
            if key in UNITS:
                patch = {"type": "UNIT", "_unit": True}
            elif key in ROMAN_NUMERALS:
                patch = {"type": "NUM", "_roman": True}
            elif key in MONTH_NAMES:
                patch = {"type": "ROOT", "_month": True}

        out.append(token if patch is None else {**token, **patch})
    return out