File size: 4,349 Bytes
ca41c16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""Fix 8: Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI)."""

from __future__ import annotations

import re

# Month names (Turkish + English, lowercase) used to reclassify tokens as ROOT.
MONTH_NAMES = {
    "ocak","şubat","mart","nisan","mayıs","haziran",
    "temmuz","ağustos","eylül","ekim","kasım","aralık",
    "january","february","march","april","may","june",
    "july","august","september","october","november","december",
}

# Lowercase measurement/currency/size unit abbreviations → UNIT token type.
UNITS = {
    "km","m","cm","mm","nm",
    "kg","g","mg","ton",
    "sn","dk","sa","ms",
    "tl","usd","eur","gbp",
    "kb","mb","gb","tb","pb",
    "ml","mcg","meq","iu","mmhg","mosm",
    "hz","mhz","ghz","watt","kw","mw","kcal","cal",
}

# Lowercase Roman numerals reclassified as NUM.
# NOTE(review): single-letter "v" and "x" are absent — presumably to avoid
# misclassifying ordinary one-letter tokens; confirm this is intentional.
ROMAN_NUMERALS = {
    "i","ii","iii","iv","vi","vii","viii","ix",
    "xi","xii","xiii","xiv","xv","xvi","xvii","xviii","xix","xx",
}

# http(s) URLs or bare www. hosts, up to the next whitespace.
URL_RE         = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
# @handle / #tag; \u00C0-\u024F admits Latin-1/Extended letters (ç, ş, ğ, …).
MENTION_RE     = re.compile(r'@[\w\u00C0-\u024F]+')
HASHTAG_RE     = re.compile(r'#[\w\u00C0-\u024F]+')
# Percentages (%50, 50%), decimals (3,5 / 3.5), dotted thousands (1.000.000),
# and fractions (3/4).
# NOTE(review): plain integers ("42") are NOT matched — confirm the base
# tokenizer handles those, since reclassify uses fullmatch on this pattern too.
NUMBER_RE      = re.compile(
    r'%\d+[\.,]?\d*'
    r'|\d+[\.,]\d+'
    r'|\d{1,3}(?:\.\d{3})+'
    r'|\d+%'
    r'|\d+/\d+'
)
# Day-first (12.05.2021) or ISO-ish year-first (2021-05-12) dates
# with ./- separators. Must run before NUMBER_RE (see preprocess order).
DATE_RE        = re.compile(
    r'\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}'
    r'|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}'
)
# Currency symbol before or after the amount ($100, 100₺).
CURRENCY_RE    = re.compile(r'[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]')
# ASCII emoticons like :) ;-D =P and <3.
TEXT_EMOJI_RE  = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
# Common Unicode emoji ranges (emoticons, symbols/pictographs, transport,
# flags, dingbats, supplemental symbols, misc symbols).
UNICODE_EMOJI_RE = re.compile(
    "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
    "\U00002700-\U000027BF\U0001F900-\U0001F9FF"
    "\U00002600-\U000026FF]+",
    flags=re.UNICODE,
)


def preprocess_special_tokens(text: str) -> tuple[str, list[dict]]:
    """Replace special tokens with placeholders before base tokenization.

    Returns the rewritten text plus a record (placeholder, type, original)
    for each substitution, so the tokens can be restored afterwards.
    """
    placeholders: list[dict] = []
    next_id = 0

    def _substitute(match: re.Match, ttype: str) -> str:
        # \x00 sentinels make the marker impossible to confuse with real text.
        nonlocal next_id
        marker = f"\x00{ttype}{next_id}\x00"
        next_id += 1
        placeholders.append(
            {"placeholder": marker, "type": ttype, "original": match.group(0)}
        )
        return marker

    # Order matters: URLs/mentions/hashtags first (they contain digits and
    # punctuation), dates and currency before bare numbers, emoji last.
    passes = (
        (URL_RE, "URL"),
        (MENTION_RE, "MENTION"),
        (HASHTAG_RE, "HASHTAG"),
        (DATE_RE, "DATE"),
        (CURRENCY_RE, "UNIT"),
        (NUMBER_RE, "NUM"),
        (UNICODE_EMOJI_RE, "EMOJI"),
        (TEXT_EMOJI_RE, "EMOJI"),
    )
    for pattern, ttype in passes:
        text = pattern.sub(lambda m, t=ttype: _substitute(m, t), text)
    return text, placeholders


def restore_special_tokens(tokens: list[dict], placeholders: list[dict]) -> list[dict]:
    """Restore placeholders in the token stream.

    Each placeholder is emitted once as a token carrying its original text
    and type; further tokens containing an already-restored placeholder are
    dropped. Tokens without any placeholder pass through unchanged.
    """
    if not placeholders:
        return tokens

    by_marker = {p["placeholder"]: p for p in placeholders}
    emitted: set[str] = set()
    out: list[dict] = []

    for tok in tokens:
        text = tok["token"]

        hit = None
        for marker, info in by_marker.items():
            if marker in text:
                hit = (marker, info)
                break

        if hit is None:
            out.append(tok)
            continue

        marker, info = hit
        if marker in emitted:
            # Already restored once — discard leftover fragments.
            continue
        emitted.add(marker)
        kind = info["type"]
        out.append({
            "token": f" {info['original']}",
            "type":  kind,
            f"_{kind.lower()}": True,
        })

    return out


def reclassify_numbers_in_tokens(tokens: list[dict]) -> list[dict]:
    """Catch remaining number/unit tokens missed by pre-tokenization.

    Only BPE/ROOT tokens are inspected; number-pattern matches become NUM,
    and (for BPE only) known units, Roman numerals, and month names are
    retagged UNIT / NUM / ROOT respectively. Everything else passes through.
    """
    out: list[dict] = []
    for tok in tokens:
        kind = tok["type"]
        if kind not in ("BPE", "ROOT"):
            out.append(tok)
            continue

        stripped = tok["token"].strip()
        lowered = stripped.lower()
        # NOTE(review): str.lower() maps "I" → "i", not Turkish "ı", so e.g.
        # "KASIM" would not match "kasım" — confirm input is lowercased upstream.

        if NUMBER_RE.fullmatch(stripped):
            out.append({**tok, "type": "NUM", "_num": True})
        elif kind == "BPE" and lowered in UNITS:
            out.append({**tok, "type": "UNIT", "_unit": True})
        elif kind == "BPE" and lowered in ROMAN_NUMERALS:
            out.append({**tok, "type": "NUM", "_roman": True})
        elif kind == "BPE" and lowered in MONTH_NAMES:
            out.append({**tok, "type": "ROOT", "_month": True})
        else:
            out.append(tok)

    return out