File size: 4,349 Bytes
ca41c16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""Fix 8: Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI)."""

from __future__ import annotations

import re

# Month names (Turkish + English, lowercase) used to reclassify tokens as ROOT.
MONTH_NAMES = {
    "ocak","şubat","mart","nisan","mayıs","haziran",
    "temmuz","ağustos","eylül","ekim","kasım","aralık",
    "january","february","march","april","may","june",
    "july","august","september","october","november","december",
}

# Lowercase measurement/currency/size unit abbreviations → UNIT token type.
UNITS = {
    "km","m","cm","mm","nm",
    "kg","g","mg","ton",
    "sn","dk","sa","ms",
    "tl","usd","eur","gbp",
    "kb","mb","gb","tb","pb",
    "ml","mcg","meq","iu","mmhg","mosm",
    "hz","mhz","ghz","watt","kw","mw","kcal","cal",
}

# Lowercase Roman numerals reclassified as NUM.
# NOTE(review): single-letter "v" and "x" are absent — presumably to avoid
# misclassifying ordinary one-letter tokens; confirm this is intentional.
ROMAN_NUMERALS = {
    "i","ii","iii","iv","vi","vii","viii","ix",
    "xi","xii","xiii","xiv","xv","xvi","xvii","xviii","xix","xx",
}

# http(s) URLs or bare www. hosts, up to the next whitespace.
URL_RE         = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
# @handle / #tag; \u00C0-\u024F admits Latin-1/Extended letters (ç, ş, ğ, …).
MENTION_RE     = re.compile(r'@[\w\u00C0-\u024F]+')
HASHTAG_RE     = re.compile(r'#[\w\u00C0-\u024F]+')
# Percentages (%50, 50%), decimals (3,5 / 3.5), dotted thousands (1.000.000),
# and fractions (3/4).
# NOTE(review): plain integers ("42") are NOT matched — confirm the base
# tokenizer handles those, since reclassify uses fullmatch on this pattern too.
NUMBER_RE      = re.compile(
    r'%\d+[\.,]?\d*'
    r'|\d+[\.,]\d+'
    r'|\d{1,3}(?:\.\d{3})+'
    r'|\d+%'
    r'|\d+/\d+'
)
# Day-first (12.05.2021) or ISO-ish year-first (2021-05-12) dates
# with ./- separators. Must run before NUMBER_RE (see preprocess order).
DATE_RE        = re.compile(
    r'\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}'
    r'|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}'
)
# Currency symbol before or after the amount ($100, 100₺).
CURRENCY_RE    = re.compile(r'[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]')
# ASCII emoticons like :) ;-D =P and <3.
TEXT_EMOJI_RE  = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
# Common Unicode emoji ranges (emoticons, symbols/pictographs, transport,
# flags, dingbats, supplemental symbols, misc symbols).
UNICODE_EMOJI_RE = re.compile(
    "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
    "\U00002700-\U000027BF\U0001F900-\U0001F9FF"
    "\U00002600-\U000026FF]+",
    flags=re.UNICODE,
)


def preprocess_special_tokens(text: str) -> tuple[str, list[dict]]:
    """Replace special tokens with placeholders before base tokenization.

    Returns the rewritten text plus a record (placeholder, type, original)
    for each substitution, so the tokens can be restored afterwards.
    """
    placeholders: list[dict] = []
    next_id = 0

    def _substitute(match: re.Match, ttype: str) -> str:
        # \x00 sentinels make the marker impossible to confuse with real text.
        nonlocal next_id
        marker = f"\x00{ttype}{next_id}\x00"
        next_id += 1
        placeholders.append(
            {"placeholder": marker, "type": ttype, "original": match.group(0)}
        )
        return marker

    # Order matters: URLs/mentions/hashtags first (they contain digits and
    # punctuation), dates and currency before bare numbers, emoji last.
    passes = (
        (URL_RE, "URL"),
        (MENTION_RE, "MENTION"),
        (HASHTAG_RE, "HASHTAG"),
        (DATE_RE, "DATE"),
        (CURRENCY_RE, "UNIT"),
        (NUMBER_RE, "NUM"),
        (UNICODE_EMOJI_RE, "EMOJI"),
        (TEXT_EMOJI_RE, "EMOJI"),
    )
    for pattern, ttype in passes:
        text = pattern.sub(lambda m, t=ttype: _substitute(m, t), text)
    return text, placeholders


def restore_special_tokens(tokens: list[dict], placeholders: list[dict]) -> list[dict]:
    """Restore placeholders in the token stream.

    Each placeholder is emitted once as a token carrying its original text
    and type; further tokens containing an already-restored placeholder are
    dropped. Tokens without any placeholder pass through unchanged.
    """
    if not placeholders:
        return tokens

    by_marker = {p["placeholder"]: p for p in placeholders}
    emitted: set[str] = set()
    out: list[dict] = []

    for tok in tokens:
        text = tok["token"]

        hit = None
        for marker, info in by_marker.items():
            if marker in text:
                hit = (marker, info)
                break

        if hit is None:
            out.append(tok)
            continue

        marker, info = hit
        if marker in emitted:
            # Already restored once — discard leftover fragments.
            continue
        emitted.add(marker)
        kind = info["type"]
        out.append({
            "token": f" {info['original']}",
            "type":  kind,
            f"_{kind.lower()}": True,
        })

    return out


def reclassify_numbers_in_tokens(tokens: list[dict]) -> list[dict]:
    """Catch remaining number/unit tokens missed by pre-tokenization.

    Only BPE/ROOT tokens are inspected; number-pattern matches become NUM,
    and (for BPE only) known units, Roman numerals, and month names are
    retagged UNIT / NUM / ROOT respectively. Everything else passes through.
    """
    out: list[dict] = []
    for tok in tokens:
        kind = tok["type"]
        if kind not in ("BPE", "ROOT"):
            out.append(tok)
            continue

        stripped = tok["token"].strip()
        lowered = stripped.lower()
        # NOTE(review): str.lower() maps "I" → "i", not Turkish "ı", so e.g.
        # "KASIM" would not match "kasım" — confirm input is lowercased upstream.

        if NUMBER_RE.fullmatch(stripped):
            out.append({**tok, "type": "NUM", "_num": True})
        elif kind == "BPE" and lowered in UNITS:
            out.append({**tok, "type": "UNIT", "_unit": True})
        elif kind == "BPE" and lowered in ROMAN_NUMERALS:
            out.append({**tok, "type": "NUM", "_roman": True})
        elif kind == "BPE" and lowered in MONTH_NAMES:
            out.append({**tok, "type": "ROOT", "_month": True})
        else:
            out.append(tok)

    return out