"""Fix 8: Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI). Uses a segment-based approach: special tokens are detected and extracted *before* the base tokenizer runs, so they never pass through it. """ from __future__ import annotations import re MONTH_NAMES = { "ocak","şubat","mart","nisan","mayıs","haziran", "temmuz","ağustos","eylül","ekim","kasım","aralık", "january","february","march","april","may","june", "july","august","september","october","november","december", } UNITS = { "km","m","cm","mm","nm", "kg","g","mg","ton", "sn","dk","sa","ms", "tl","usd","eur","gbp", "kb","mb","gb","tb","pb", "ml","mcg","meq","iu","mmhg","mosm", "hz","mhz","ghz","watt","kw","mw","kcal","cal", } ROMAN_NUMERALS = { "i","ii","iii","iv","vi","vii","viii","ix", "xi","xii","xiii","xiv","xv","xvi","xvii","xviii","xix","xx", } # ── Regex patterns ──────────────────────────────────────────────────────────── URL_RE = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE) MENTION_RE = re.compile(r'@[\w\u00C0-\u024F]+') HASHTAG_RE = re.compile(r'#[\w\u00C0-\u024F]+') # Turkish suffixes that can follow a number+apostrophe _NUM_SUFFIXES = sorted( [ "nın","nin","nun","nün","dan","den","tan","ten", "da","de","ta","te","ya","ye","nda","nde", "yı","yi","yu","yü","nı","ni","nu","nü", "lar","ler","lara","lere","ları","leri", "ım","im","um","üm","ın","in","un","ün", "mız","miz","muz","müz","nız","niz","nuz","nüz", "dır","dir","dur","dür","tır","tir","tur","tür", "ki","li","lı","lu","lü","sız","siz","suz","süz", "inci","ıncı","uncu","üncü","nci","ncı", "lık","lik","luk","lük", "a","e","ı","i","u","ü", ], key=len, reverse=True, ) _SUFFIX_ALT = '|'.join(re.escape(s) for s in _NUM_SUFFIXES) # Number (or time) followed by apostrophe + Turkish suffix(es) NUM_APOSTROPHE_RE = re.compile( r"\d+(?:[.:,]\d+)*['\u2019](?:" + _SUFFIX_ALT + r")+\b", re.IGNORECASE, ) DATE_RE = re.compile( r'\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}' r'|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}' ) CURRENCY_RE = re.compile(r'[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]') NUMBER_RE = re.compile( r'%\d+[\.,]?\d*' r'|\d{1,3}(?:\.\d{3})+' # thousands (1.000.000) — before decimal! r'|\d+[\.,]\d+' # decimal (2.5, 10,5) r'|\d+%' r'|\d+/\d+' ) TIME_RE = re.compile(r'\d{1,2}:\d{2}(?::\d{2})?') PLAIN_NUM_RE = re.compile(r'\b\d+\b') # ── Acronym patterns ───────────────────────────────────────────────────────── # Matches standalone uppercase sequences (+ optional trailing digits). # [A-Z]{2,}[0-9]* → HTML, GPT, CSS3, HTML5, MP3 # [A-Z][0-9]+ → F16, H264, A4 # Does NOT match mixed-case words (ChatGPT) because \b won't fire mid-word. ACRONYM_RE = re.compile( r"\b[A-ZÇĞİÖŞÜ]{2,}[0-9]*\b" r"|\b[A-ZÇĞİÖŞÜ][0-9]+\b" ) # Acronym followed by apostrophe + Turkish suffix(es): NATO'nun, HTML5'ten ACRONYM_APOSTROPHE_RE = re.compile( r"\b(?:[A-ZÇĞİÖŞÜ]{2,}[0-9]*|[A-ZÇĞİÖŞÜ][0-9]+)['\u2019](?:" + _SUFFIX_ALT + r")+\b" ) TEXT_EMOJI_RE = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3') UNICODE_EMOJI_RE = re.compile( "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF" "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF" "\U00002700-\U000027BF\U0001F900-\U0001F9FF" "\U00002600-\U000026FF]+", flags=re.UNICODE, ) # Pattern priority: earlier entries win when spans overlap. _SPAN_PATTERNS: list[tuple[re.Pattern, str]] = [ (URL_RE, "URL"), (MENTION_RE, "MENTION"), (HASHTAG_RE, "HASHTAG"), (DATE_RE, "DATE"), (CURRENCY_RE, "UNIT"), (NUM_APOSTROPHE_RE, "NUM_APO"), (ACRONYM_APOSTROPHE_RE, "ACRONYM_APO"), (ACRONYM_RE, "ACRONYM"), (NUMBER_RE, "NUM"), (TIME_RE, "NUM"), (PLAIN_NUM_RE, "NUM"), (UNICODE_EMOJI_RE, "EMOJI"), (TEXT_EMOJI_RE, "EMOJI"), ] # ── Acronym vs Turkish word disambiguation ─────────────────────────────────── def _is_known_turkish_word(word_upper: str) -> bool: """Return True if *word_upper* (ALL CAPS) is a known Turkish word. Checks (in order): 1. ACRONYM_EXPANSIONS dict → always acronym (return False) 2. Same dict without trailing digits (HTML5 → HTML) 3. TDK dictionary → Turkish word (return True) 4. Proper nouns list → Turkish word (return True) 5. Otherwise → treat as acronym (return False) """ from ._acronym_dict import ACRONYM_EXPANSIONS # noqa: PLC0415 from ._preprocessor import _turkish_lower, _load_proper_nouns # noqa: PLC0415 from ._tdk_vocab import load_tdk_words # noqa: PLC0415 # Known acronyms always win if word_upper in ACRONYM_EXPANSIONS: return False # Also check without trailing digits (HTML5 → HTML) base = word_upper.rstrip("0123456789") if base and base != word_upper and base in ACRONYM_EXPANSIONS: return False wl = _turkish_lower(word_upper) # TDK dictionary: if the lowercase form is a real Turkish word → not acronym tdk = load_tdk_words() if tdk and wl in tdk: return True # Proper nouns (İstanbul, Ankara…) if wl in _load_proper_nouns(): return True return False # ── Segment-based API ──────────────────────────────────────────────────────── def find_special_spans(text: str) -> list[tuple[int, int, str, str]]: """Find all special-token spans in *text*. Returns a sorted, non-overlapping list of ``(start, end, token_type, original_text)``. """ candidates: list[tuple[int, int, str, str]] = [] for pattern, ttype in _SPAN_PATTERNS: for m in pattern.finditer(text): original = m.group(0) # Acronym filtering: skip if it's actually a Turkish word if ttype in ("ACRONYM", "ACRONYM_APO"): # Extract the uppercase base (before apostrophe for APO) if ttype == "ACRONYM_APO": apo = original.find("'") if apo == -1: apo = original.find("\u2019") acr_base = original[:apo] else: acr_base = original if _is_known_turkish_word(acr_base): continue candidates.append((m.start(), m.end(), ttype, original)) # Sort by start position, then prefer longer match candidates.sort(key=lambda x: (x[0], -(x[1] - x[0]))) # Greedy non-overlapping selection result: list[tuple[int, int, str, str]] = [] last_end = 0 for s, e, t, o in candidates: if s >= last_end: result.append((s, e, t, o)) last_end = e return result def _split_apostrophe_suffixes(suffix_str: str) -> list[dict]: """Split a suffix string (after apostrophe) into individual SUFFIX tokens.""" tokens: list[dict] = [] remaining = suffix_str.lower() while remaining: matched = False for s in _NUM_SUFFIXES: if remaining.startswith(s): tokens.append({"token": s, "type": "SUFFIX", "_apo_suffix": True}) remaining = remaining[len(s):] matched = True break if not matched: tokens.append({"token": remaining, "type": "SUFFIX", "_apo_suffix": True}) break return tokens def make_special_tokens(span_type: str, original: str) -> list[dict]: """Create token dict(s) for a matched special span. ``NUM_APO`` and ``ACRONYM_APO`` spans are split into base + SUFFIX tokens. """ # ── Number + apostrophe + suffix (3'te, 1990'larda) ────────────────── if span_type == "NUM_APO": apo_pos = original.find("'") if apo_pos == -1: apo_pos = original.find("\u2019") num_part = original[:apo_pos] return [ {"token": f" {num_part}", "type": "NUM", "_num": True}, *_split_apostrophe_suffixes(original[apo_pos + 1:]), ] # ── Acronym + apostrophe + suffix (NATO'nun, HTML5'ten) ────────────── if span_type == "ACRONYM_APO": apo_pos = original.find("'") if apo_pos == -1: apo_pos = original.find("\u2019") acr_part = original[:apo_pos] return [ {"token": f" {acr_part}", "type": "ACRONYM", "_acronym": True}, *_split_apostrophe_suffixes(original[apo_pos + 1:]), ] # ── Plain acronym (HTML5, GPT) ────────────────────────────────────── if span_type == "ACRONYM": return [{"token": f" {original}", "type": "ACRONYM", "_acronym": True}] # ── Everything else (NUM, DATE, URL, MENTION, HASHTAG, EMOJI, UNIT) ── return [{ "token": f" {original}", "type": span_type, f"_{span_type.lower()}": True, }] # ── Safety-net post-pass ───────────────────────────────────────────────────── def reclassify_numbers_in_tokens(tokens: list[dict]) -> list[dict]: """Catch remaining number/unit tokens missed by span detection.""" result: list[dict] = [] for tok in tokens: if tok["type"] not in ("BPE", "ROOT"): result.append(tok) continue raw = tok["token"].strip() if NUMBER_RE.fullmatch(raw): result.append({**tok, "type": "NUM", "_num": True}) elif raw.lower() in UNITS and tok["type"] == "BPE": result.append({**tok, "type": "UNIT", "_unit": True}) elif raw.lower() in ROMAN_NUMERALS and tok["type"] == "BPE": result.append({**tok, "type": "NUM", "_roman": True}) elif raw.lower() in MONTH_NAMES and tok["type"] == "BPE": result.append({**tok, "type": "ROOT", "_month": True}) else: result.append(tok) return result