# File size: 10,738 Bytes

"""Fix 8: Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI).

Uses a segment-based approach: special tokens are detected and extracted
*before* the base tokenizer runs, so they never pass through it.
"""

from __future__ import annotations

import re

# Lowercase month names (Turkish first, then English).
# reclassify_numbers_in_tokens() looks tokens up here via ``raw.lower()``.
# NOTE(review): ``str.lower()`` is not Turkish-aware, so an all-caps token
# such as "KASIM" lowercases to "kasim" and will not be found in this set.
MONTH_NAMES = {
    "ocak","şubat","mart","nisan","mayıs","haziran",
    "temmuz","ağustos","eylül","ekim","kasım","aralık",
    "january","february","march","april","may","june",
    "july","august","september","october","november","december",
}

# Lowercase unit / currency abbreviations.  reclassify_numbers_in_tokens()
# retags a BPE token as UNIT when its stripped, lowercased text is in this set.
# Rows presumably group length, mass, time, currency, data size, medical and
# frequency/power/energy units — grouping inferred from the entries only.
UNITS = {
    "km","m","cm","mm","nm",
    "kg","g","mg","ton",
    "sn","dk","sa","ms",
    "tl","usd","eur","gbp",
    "kb","mb","gb","tb","pb",
    "ml","mcg","meq","iu","mmhg","mosm",
    "hz","mhz","ghz","watt","kw","mw","kcal","cal",
}

# Lowercase Roman numerals from "i" up to "xx".
# NOTE(review): the single letters "v" and "x" are absent from this set;
# confirm whether that omission is deliberate (e.g. to avoid false positives).
ROMAN_NUMERALS = {
    "i","ii","iii","iv","vi","vii","viii","ix",
    "xi","xii","xiii","xiv","xv","xvi","xvii","xviii","xix","xx",
}

# ── Regex patterns ────────────────────────────────────────────────────────────

# URL: "http://", "https://" or "www." followed by a run of non-whitespace
# characters (case-insensitive).  Because the tail is ``\S+``, punctuation
# that directly follows the URL in the text is included in the match.
URL_RE         = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
# Mention / hashtag: "@" or "#" followed by one or more word characters or
# characters in the U+00C0–U+024F range.
MENTION_RE     = re.compile(r'@[\w\u00C0-\u024F]+')
HASHTAG_RE     = re.compile(r'#[\w\u00C0-\u024F]+')

# Turkish suffixes that can follow a number+apostrophe.
# The list is sorted longest-first so that both the regex alternation built
# from it and _split_apostrophe_suffixes() try longer suffixes before their
# shorter prefixes (e.g. "lara" before "lar" before "a").  ``sorted`` is
# stable, so suffixes of equal length keep the order written below.
_NUM_SUFFIXES = sorted(
    [
        "nın","nin","nun","nün","dan","den","tan","ten",
        "da","de","ta","te","ya","ye","nda","nde",
        "yı","yi","yu","yü","nı","ni","nu","nü",
        "lar","ler","lara","lere","ları","leri",
        "ım","im","um","üm","ın","in","un","ün",
        "mız","miz","muz","müz","nız","niz","nuz","nüz",
        "dır","dir","dur","dür","tır","tir","tur","tür",
        "ki","li","lı","lu","lü","sız","siz","suz","süz",
        "inci","ıncı","uncu","üncü","nci","ncı",
        "lık","lik","luk","lük",
        "a","e","ı","i","u","ü",
    ],
    key=len,
    reverse=True,
)

# Regex alternation of all suffixes (escaped), in the longest-first order above.
_SUFFIX_ALT = '|'.join(re.escape(s) for s in _NUM_SUFFIXES)

# Number (or time) followed by apostrophe + Turkish suffix(es).
# Shape: digits, optionally more digit groups joined by ".", ":" or ",",
# then a straight (') or typographic (U+2019) apostrophe, then one or more
# suffixes ending at a word boundary.  Matching is case-insensitive.
# Examples used elsewhere in this file: 3'te, 1990'larda.
NUM_APOSTROPHE_RE = re.compile(
    r"\d+(?:[.:,]\d+)*['\u2019](?:" + _SUFFIX_ALT + r")+\b",
    re.IGNORECASE,
)

# Date: either 1-2 digits, 1-2 digits, 2-4 digits, or 4 digits, 1-2 digits,
# 1-2 digits, with ".", "/" or "-" between the parts.  Each separator is
# matched independently, so mixed separators are accepted, and field values
# are not range-checked.
DATE_RE        = re.compile(
    r'\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}'
    r'|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}'
)
# Currency amount: a currency symbol directly before or directly after a
# number with at most one "." or "," separator.  _SPAN_PATTERNS labels these
# spans "UNIT".
CURRENCY_RE    = re.compile(r'[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]')
# Formatted numbers: leading-% percentages, dot-grouped thousands, decimals,
# trailing-% percentages and simple fractions.  There is no alternative for a
# bare integer; those are matched by PLAIN_NUM_RE.
NUMBER_RE      = re.compile(
    r'%\d+[\.,]?\d*'
    r'|\d{1,3}(?:\.\d{3})+'       # thousands (1.000.000) — before decimal!
    r'|\d+[\.,]\d+'               # decimal (2.5, 10,5)
    r'|\d+%'
    r'|\d+/\d+'
)
# Clock time: H:MM or HH:MM with optional :SS (values are not range-checked).
TIME_RE        = re.compile(r'\d{1,2}:\d{2}(?::\d{2})?')
# Bare integer delimited by word boundaries on both sides.
PLAIN_NUM_RE   = re.compile(r'\b\d+\b')

# ── Acronym patterns ─────────────────────────────────────────────────────────
# Matches standalone uppercase sequences (+ optional trailing digits).
# "Uppercase" here means A-Z plus the Turkish capitals Ç, Ğ, İ, Ö, Ş, Ü.
#   two or more capitals, then optional digits  → HTML, GPT, CSS3, HTML5, MP3
#   one capital, then one or more digits        → F16, H264, A4
# Does NOT match mixed-case words (ChatGPT) because \b won't fire mid-word.
# The pattern is case-sensitive (no IGNORECASE flag).
ACRONYM_RE = re.compile(
    r"\b[A-ZÇĞİÖŞÜ]{2,}[0-9]*\b"
    r"|\b[A-ZÇĞİÖŞÜ][0-9]+\b"
)

# Acronym followed by apostrophe + Turkish suffix(es): NATO'nun, HTML5'ten
# Accepts a straight (') or typographic (U+2019) apostrophe.  Unlike
# NUM_APOSTROPHE_RE this pattern is compiled without IGNORECASE, so only the
# lowercase suffix spellings from _NUM_SUFFIXES match.
ACRONYM_APOSTROPHE_RE = re.compile(
    r"\b(?:[A-ZÇĞİÖŞÜ]{2,}[0-9]*|[A-ZÇĞİÖŞÜ][0-9]+)['\u2019](?:"
    + _SUFFIX_ALT + r")+\b"
)

# ASCII emoticons: eyes (":", ";" or "="), an optional "-" nose, and one
# mouth character from ")(][dDpPoO3"; or the literal heart "<3".
# The pattern has no boundary anchors, so it can also match inside other
# text (e.g. the ":d" in "a:d").
TEXT_EMOJI_RE  = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
# Runs of one or more code points from the listed ranges:
# U+1F600–1F64F, U+1F300–1F5FF, U+1F680–1F6FF, U+1F1E0–1F1FF,
# U+2700–27BF, U+1F900–1F9FF and U+2600–26FF.
# Adjacent emoji are returned as a single match because of the trailing "+".
UNICODE_EMOJI_RE = re.compile(
    "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
    "\U00002700-\U000027BF\U0001F900-\U0001F9FF"
    "\U00002600-\U000026FF]+",
    flags=re.UNICODE,
)

# (pattern, span type) pairs scanned by find_special_spans().
# Overlap resolution there is: leftmost start first, then the longest match.
# List order only acts as a tie-breaker: candidates are collected in this
# order and sorted with a stable sort, so when two patterns produce a span
# with the same start AND the same length, the earlier entry wins.
# Note that CURRENCY_RE spans are labelled "UNIT", and that NUMBER_RE,
# TIME_RE and PLAIN_NUM_RE all share the "NUM" label.
_SPAN_PATTERNS: list[tuple[re.Pattern, str]] = [
    (URL_RE,                 "URL"),
    (MENTION_RE,             "MENTION"),
    (HASHTAG_RE,             "HASHTAG"),
    (DATE_RE,                "DATE"),
    (CURRENCY_RE,            "UNIT"),
    (NUM_APOSTROPHE_RE,      "NUM_APO"),
    (ACRONYM_APOSTROPHE_RE,  "ACRONYM_APO"),
    (ACRONYM_RE,             "ACRONYM"),
    (NUMBER_RE,              "NUM"),
    (TIME_RE,                "NUM"),
    (PLAIN_NUM_RE,           "NUM"),
    (UNICODE_EMOJI_RE,       "EMOJI"),
    (TEXT_EMOJI_RE,          "EMOJI"),
]


# ── Acronym vs Turkish word disambiguation ───────────────────────────────────

def _is_known_turkish_word(word_upper: str) -> bool:
    """Decide whether an ALL-CAPS candidate is an ordinary word, not an acronym.

    Decision order:

    1. *word_upper* is a key of ``ACRONYM_EXPANSIONS`` → acronym (``False``).
    2. *word_upper* with its trailing digits removed is such a key
       (HTML5 → HTML) → acronym (``False``).
    3. The Turkish-lowercased form is in the TDK word set → word (``True``).
    4. The Turkish-lowercased form is in the proper-noun collection
       → word (``True``).
    5. Anything else is treated as an acronym (``False``).

    The project-local lookups are imported lazily inside the function.
    """
    from ._acronym_dict import ACRONYM_EXPANSIONS  # noqa: PLC0415
    from ._preprocessor import _turkish_lower, _load_proper_nouns  # noqa: PLC0415
    from ._tdk_vocab import load_tdk_words  # noqa: PLC0415

    # Steps 1-2: a listed acronym (with or without a digit tail) always wins.
    stem = word_upper.rstrip("0123456789")
    has_digit_tail = bool(stem) and stem != word_upper
    if word_upper in ACRONYM_EXPANSIONS or (
        has_digit_tail and stem in ACRONYM_EXPANSIONS
    ):
        return False

    lowered = _turkish_lower(word_upper)

    # Step 3: dictionary hit; an empty/missing vocabulary never matches.
    vocabulary = load_tdk_words()
    in_dictionary = bool(vocabulary) and lowered in vocabulary

    # Step 4 is only consulted when step 3 did not already say "word".
    return in_dictionary or lowered in _load_proper_nouns()


# ── Segment-based API ────────────────────────────────────────────────────────

def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
    """Locate every special-token span in *text*.

    Each pattern in ``_SPAN_PATTERNS`` is run over the whole text. Acronym
    candidates (``ACRONYM`` / ``ACRONYM_APO``) are dropped when
    ``_is_known_turkish_word`` reports that their uppercase base is an
    ordinary word. Overlaps are then resolved by taking the leftmost span,
    preferring the longer one when two spans start at the same offset.

    Returns a list of ``(start, end, token_type, original_text)`` tuples,
    ordered by ``start`` and mutually non-overlapping.
    """
    found: list[tuple[int, int, str, str]] = []
    for regex, label in _SPAN_PATTERNS:
        needs_word_check = label in ("ACRONYM", "ACRONYM_APO")
        for match in regex.finditer(text):
            matched = match.group(0)

            if needs_word_check:
                head = matched
                if label == "ACRONYM_APO":
                    # Keep only the part before the (straight or typographic)
                    # apostrophe; the suffix is irrelevant for the lookup.
                    cut = matched.find("'")
                    if cut == -1:
                        cut = matched.find("\u2019")
                    head = matched[:cut]
                if _is_known_turkish_word(head):
                    continue

            found.append((match.start(), match.end(), label, matched))

    # Leftmost first; for equal starts the longer span sorts first
    # (start - end is the negated length).  The sort is stable.
    found.sort(key=lambda span: (span[0], span[0] - span[1]))

    # Sweep left to right, keeping a span only if it begins at or after the
    # end of the previously kept one.
    selected: list[tuple[int, int, str, str]] = []
    cursor = 0
    for span in found:
        if span[0] >= cursor:
            selected.append(span)
            cursor = span[1]
    return selected


def _split_apostrophe_suffixes(suffix_str: str) -> list[dict]:
    """Split a suffix string (after apostrophe) into individual SUFFIX tokens."""
    tokens: list[dict] = []
    remaining = suffix_str.lower()
    while remaining:
        matched = False
        for s in _NUM_SUFFIXES:
            if remaining.startswith(s):
                tokens.append({"token": s, "type": "SUFFIX", "_apo_suffix": True})
                remaining = remaining[len(s):]
                matched = True
                break
        if not matched:
            tokens.append({"token": remaining, "type": "SUFFIX", "_apo_suffix": True})
            break
    return tokens


def make_special_tokens(span_type: str, original: str) -> list[dict]:
    """Build the token dict(s) for one matched special span.

    ``NUM_APO`` and ``ACRONYM_APO`` spans (3'te, NATO'nun) yield a base token
    typed ``NUM`` / ``ACRONYM`` followed by the SUFFIX tokens produced by
    ``_split_apostrophe_suffixes`` for the text after the apostrophe.

    Every other span type (ACRONYM, NUM, DATE, URL, MENTION, HASHTAG, EMOJI,
    UNIT) yields a single token whose ``type`` is *span_type* and which
    carries a ``_<span_type lowercased>`` flag set to ``True``.

    All base tokens are emitted with one leading space before the text.
    """
    # span type → (type of the base token, name of its marker flag)
    apostrophe_kinds = {
        "NUM_APO": ("NUM", "_num"),
        "ACRONYM_APO": ("ACRONYM", "_acronym"),
    }

    if span_type in apostrophe_kinds:
        base_type, marker = apostrophe_kinds[span_type]
        # Straight apostrophe first, typographic (U+2019) as the fallback.
        cut = original.find("'")
        if cut == -1:
            cut = original.find("\u2019")
        head = {"token": f" {original[:cut]}", "type": base_type, marker: True}
        return [head] + _split_apostrophe_suffixes(original[cut + 1:])

    # Plain ACRONYM needs no special case: its flag is "_acronym" either way.
    marker = f"_{span_type.lower()}"
    return [{"token": f" {original}", "type": span_type, marker: True}]


# ── Safety-net post-pass ─────────────────────────────────────────────────────

def reclassify_numbers_in_tokens(tokens: list[dict]) -> list[dict]:
    """Retag number-, unit-, numeral- and month-like tokens after tokenization.

    Only tokens typed ``BPE`` or ``ROOT`` are examined; all others pass
    through untouched. For an examined token the stripped text is checked in
    this order, and the first hit wins:

    * full match of ``NUMBER_RE``        → type ``NUM``, flag ``_num``
    * BPE only, lowercase in ``UNITS``   → type ``UNIT``, flag ``_unit``
    * BPE only, in ``ROMAN_NUMERALS``    → type ``NUM``, flag ``_roman``
    * BPE only, in ``MONTH_NAMES``       → type ``ROOT``, flag ``_month``

    ``NUMBER_RE`` has no bare-integer alternative, so a token such as "42"
    is not retagged by the first rule.

    Returns a new list. Retagged tokens are shallow copies with the updated
    fields; unchanged tokens are the original dict objects.
    """
    output: list[dict] = []
    for token in tokens:
        kind = token["type"]
        if kind not in ("BPE", "ROOT"):
            output.append(token)
            continue

        surface = token["token"].strip()

        changes = None
        if NUMBER_RE.fullmatch(surface):
            changes = {"type": "NUM", "_num": True}
        elif kind == "BPE":
            # The set lookups apply to BPE tokens only; ROOT tokens are
            # left as they are unless they matched NUMBER_RE above.
            folded = surface.lower()
            if folded in UNITS:
                changes = {"type": "UNIT", "_unit": True}
            elif folded in ROMAN_NUMERALS:
                changes = {"type": "NUM", "_roman": True}
            elif folded in MONTH_NAMES:
                changes = {"type": "ROOT", "_month": True}

        output.append(token if changes is None else {**token, **changes})

    return output