| """Fix 8: Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI).""" |
|
|
| from __future__ import annotations |
|
|
| import re |
|
|
# Lowercase month names treated as date words (Turkish first, then English).
MONTH_NAMES = {
    *"ocak şubat mart nisan mayıs haziran temmuz ağustos eylül ekim kasım aralık".split(),
    *"january february march april may june july august september october november december".split(),
}
|
|
# Lowercase unit / currency abbreviations recognized as UNIT tokens.
UNITS = set(
    (
        "km m cm mm nm "                  # length
        "kg g mg ton "                    # mass
        "sn dk sa ms "                    # time (Turkish abbreviations)
        "tl usd eur gbp "                 # currency codes
        "kb mb gb tb pb "                 # data sizes
        "ml mcg meq iu mmhg mosm "        # volume / medical
        "hz mhz ghz watt kw mw kcal cal"  # frequency / power / energy
    ).split()
)
|
|
# Lowercase Roman numerals from the i–xx range.  The single letters "v" and
# "x" (and l/c/d/m) are absent — presumably excluded on purpose so ordinary
# single-letter tokens are not misread as numerals; TODO confirm.
ROMAN_NUMERALS = set(
    "i ii iii iv vi vii viii ix "
    "xi xii xiii xiv xv xvi xvii xviii xix xx".split()
)
|
|
# NOTE(review): in Python 3 `\w` on str already matches all Unicode word
# characters, so the explicit \u00C0-\u024F range (Latin-1 Supplement +
# Latin Extended) in MENTION_RE/HASHTAG_RE is redundant but harmless.
URL_RE = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
MENTION_RE = re.compile(r'@[\w\u00C0-\u024F]+')
HASHTAG_RE = re.compile(r'#[\w\u00C0-\u024F]+')
# Numeric forms, tried in alternation order.
# NOTE(review): a plain-integer alternative (\d+) is absent — presumably
# bare integers are handled by the base tokenizer; confirm.
NUMBER_RE = re.compile(
    r'%\d+[\.,]?\d*'          # Turkish-style percent prefix: %50, %7,5
    r'|\d+[\.,]\d+'           # decimal number: 3.14 or 3,14
    r'|\d{1,3}(?:\.\d{3})+'   # dotted thousands grouping: 1.000.000
    r'|\d+%'                  # percent suffix: 50%
    r'|\d+/\d+'               # fraction: 3/4
)
# Day-first (1.1.2024, 01/01/24) or year-first (2024-01-01) dates,
# with '.', '/' or '-' as separator.
DATE_RE = re.compile(
    r'\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}'
    r'|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}'
)
# Currency symbol before ($100, €9,99) or after (100₺) the amount.
CURRENCY_RE = re.compile(r'[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]')
# ASCII emoticons such as :-) ;D =P :3, plus the heart <3.
TEXT_EMOJI_RE = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
# One or more codepoints from the common emoji blocks: emoticons,
# misc symbols & pictographs, transport, regional indicators (flags),
# dingbats, supplemental symbols, misc symbols.
UNICODE_EMOJI_RE = re.compile(
    "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
    "\U00002700-\U000027BF\U0001F900-\U0001F9FF"
    "\U00002600-\U000026FF]+",
    flags=re.UNICODE,
)
|
|
|
|
def preprocess_special_tokens(text: str) -> tuple[str, list[dict]]:
    """Replace special tokens with placeholders before base tokenization.

    Every match of the URL / mention / hashtag / date / currency / number /
    emoji patterns is swapped for a unique sentinel of the form
    ``"\\x00<TYPE><n>\\x00"`` so the base tokenizer cannot split it.  Each
    replacement is recorded so it can later be undone by
    ``restore_special_tokens``.

    Args:
        text: Raw input text.

    Returns:
        ``(text, placeholders)`` where ``text`` has all special tokens
        replaced and each ``placeholders`` entry is a dict with keys
        ``placeholder``, ``type`` and ``original``.
    """
    placeholders: list[dict] = []
    counter = 0  # nonlocal int instead of the mutable-list closure hack

    def _ph(token_type: str, original: str) -> str:
        # Mint a unique sentinel and record the replacement.
        nonlocal counter
        ph = f"\x00{token_type}{counter}\x00"
        placeholders.append({"placeholder": ph, "type": token_type, "original": original})
        counter += 1
        return ph

    def _replace(pattern: re.Pattern, ttype: str, t: str) -> str:
        return pattern.sub(lambda m: _ph(ttype, m.group(0)), t)

    # Order matters: URLs first so their digits and punctuation are not
    # later misread as numbers or dates; currency before bare numbers so
    # the symbol stays attached to the amount; text emoticons last since
    # their characters (:, ;, parentheses) are common elsewhere.
    text = _replace(URL_RE, "URL", text)
    text = _replace(MENTION_RE, "MENTION", text)
    text = _replace(HASHTAG_RE, "HASHTAG", text)
    text = _replace(DATE_RE, "DATE", text)
    text = _replace(CURRENCY_RE, "UNIT", text)
    text = _replace(NUMBER_RE, "NUM", text)
    text = _replace(UNICODE_EMOJI_RE, "EMOJI", text)
    text = _replace(TEXT_EMOJI_RE, "EMOJI", text)
    return text, placeholders
|
|
|
|
def restore_special_tokens(tokens: list[dict], placeholders: list[dict]) -> list[dict]:
    """Restore placeholders in the token stream.

    A token containing a known placeholder (as a substring) is replaced by
    a fresh token carrying the original text with a leading space; later
    tokens containing an already-restored placeholder are dropped (the
    base tokenizer may have split one placeholder into fragments).  All
    other tokens pass through unchanged.
    """
    if not placeholders:
        return tokens

    by_sentinel = {entry["placeholder"]: entry for entry in placeholders}
    emitted: set[str] = set()
    out: list[dict] = []

    for tok in tokens:
        text = tok["token"]

        # Find the first recorded placeholder embedded in this token.
        hit = None
        for sentinel, entry in by_sentinel.items():
            if sentinel in text:
                hit = (sentinel, entry)
                break

        if hit is None:
            out.append(tok)
            continue

        sentinel, entry = hit
        if sentinel in emitted:
            # Duplicate fragment of a placeholder already restored — drop it.
            continue

        emitted.add(sentinel)
        kind = entry["type"]
        out.append({
            "token": f" {entry['original']}",
            "type": kind,
            f"_{kind.lower()}": True,
        })

    return out
|
|
|
|
def reclassify_numbers_in_tokens(tokens: list[dict]) -> list[dict]:
    """Catch remaining number/unit tokens missed by pre-tokenization.

    Only tokens typed ``BPE`` or ``ROOT`` are reconsidered: a full
    NUMBER_RE match becomes ``NUM``; for ``BPE`` tokens only, known unit
    abbreviations become ``UNIT``, Roman numerals become ``NUM``, and
    month names become ``ROOT``.  Everything else passes through as-is.
    """
    out: list[dict] = []
    for tok in tokens:
        kind = tok["type"]
        if kind not in ("BPE", "ROOT"):
            out.append(tok)
            continue

        stripped = tok["token"].strip()
        lowered = stripped.lower()

        if NUMBER_RE.fullmatch(stripped):
            out.append({**tok, "type": "NUM", "_num": True})
        elif kind == "BPE" and lowered in UNITS:
            out.append({**tok, "type": "UNIT", "_unit": True})
        elif kind == "BPE" and lowered in ROMAN_NUMERALS:
            out.append({**tok, "type": "NUM", "_roman": True})
        elif kind == "BPE" and lowered in MONTH_NAMES:
            out.append({**tok, "type": "ROOT", "_month": True})
        else:
            out.append(tok)

    return out
|
|