"""Fix 8: Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI).""" from __future__ import annotations import re MONTH_NAMES = { "ocak","şubat","mart","nisan","mayıs","haziran", "temmuz","ağustos","eylül","ekim","kasım","aralık", "january","february","march","april","may","june", "july","august","september","october","november","december", } UNITS = { "km","m","cm","mm","nm", "kg","g","mg","ton", "sn","dk","sa","ms", "tl","usd","eur","gbp", "kb","mb","gb","tb","pb", "ml","mcg","meq","iu","mmhg","mosm", "hz","mhz","ghz","watt","kw","mw","kcal","cal", } ROMAN_NUMERALS = { "i","ii","iii","iv","vi","vii","viii","ix", "xi","xii","xiii","xiv","xv","xvi","xvii","xviii","xix","xx", } URL_RE = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE) MENTION_RE = re.compile(r'@[\w\u00C0-\u024F]+') HASHTAG_RE = re.compile(r'#[\w\u00C0-\u024F]+') NUMBER_RE = re.compile( r'%\d+[\.,]?\d*' r'|\d+[\.,]\d+' r'|\d{1,3}(?:\.\d{3})+' r'|\d+%' r'|\d+/\d+' ) DATE_RE = re.compile( r'\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}' r'|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}' ) CURRENCY_RE = re.compile(r'[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]') TEXT_EMOJI_RE = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3') UNICODE_EMOJI_RE = re.compile( "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF" "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF" "\U00002700-\U000027BF\U0001F900-\U0001F9FF" "\U00002600-\U000026FF]+", flags=re.UNICODE, ) def preprocess_special_tokens(text: str) -> tuple[str, list[dict]]: """Replace special tokens with placeholders before base tokenization.""" placeholders: list[dict] = [] counter = [0] def _ph(token_type: str, original: str) -> str: ph = f"\x00{token_type}{counter[0]}\x00" placeholders.append({"placeholder": ph, "type": token_type, "original": original}) counter[0] += 1 return ph def _replace(pattern: re.Pattern, ttype: str, t: str) -> str: return pattern.sub(lambda m: _ph(ttype, m.group(0)), t) text = _replace(URL_RE, "URL", text) text = _replace(MENTION_RE, "MENTION", text) text = _replace(HASHTAG_RE, "HASHTAG", text) text = _replace(DATE_RE, "DATE", text) text = _replace(CURRENCY_RE, "UNIT", text) text = _replace(NUMBER_RE, "NUM", text) text = _replace(UNICODE_EMOJI_RE, "EMOJI", text) text = _replace(TEXT_EMOJI_RE, "EMOJI", text) return text, placeholders def restore_special_tokens(tokens: list[dict], placeholders: list[dict]) -> list[dict]: """Restore placeholders in the token stream.""" if not placeholders: return tokens ph_map = {p["placeholder"]: p for p in placeholders} restored: set[str] = set() result: list[dict] = [] for tok in tokens: raw = tok["token"] matched = next(((ph, info) for ph, info in ph_map.items() if ph in raw), None) if matched: ph, info = matched if ph not in restored: restored.add(ph) ttype = info["type"] result.append({ "token": f" {info['original']}", "type": ttype, f"_{ttype.lower()}": True, }) else: result.append(tok) return result def reclassify_numbers_in_tokens(tokens: list[dict]) -> list[dict]: """Catch remaining number/unit tokens missed by pre-tokenization.""" result: list[dict] = [] for tok in tokens: if tok["type"] not in ("BPE", "ROOT"): result.append(tok) continue raw = tok["token"].strip() if NUMBER_RE.fullmatch(raw): result.append({**tok, "type": "NUM", "_num": True}) elif raw.lower() in UNITS and tok["type"] == "BPE": result.append({**tok, "type": "UNIT", "_unit": True}) elif raw.lower() in ROMAN_NUMERALS and tok["type"] == "BPE": result.append({**tok, "type": "NUM", "_roman": True}) elif raw.lower() in MONTH_NAMES and tok["type"] == "BPE": result.append({**tok, "type": "ROOT", "_month": True}) else: result.append(tok) return result