# turk-tokenizer / turk_tokenizer/_normalizer.py
# Author: nmstech
# Initial release: TurkTokenizer v1.0.0 — TR-MMLU 92%
# (commit ca41c16, verified)
"""Fix 8: Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI)."""
from __future__ import annotations
import re
# Lowercase Turkish and English month names.  reclassify_numbers_in_tokens
# retags BPE tokens found here (compared case-insensitively) as ROOT.
MONTH_NAMES = {
"ocak","şubat","mart","nisan","mayıs","haziran",
"temmuz","ağustos","eylül","ekim","kasım","aralık",
"january","february","march","april","may","june",
"july","august","september","october","november","december",
}
# Lowercase unit abbreviations; reclassify_numbers_in_tokens retags BPE
# tokens found here (compared case-insensitively) as UNIT.
UNITS = {
"km","m","cm","mm","nm",  # length
"kg","g","mg","ton",  # mass
"sn","dk","sa","ms",  # time (Turkish: saniye, dakika, saat) + milliseconds
"tl","usd","eur","gbp",  # currency codes
"kb","mb","gb","tb","pb",  # data sizes
"ml","mcg","meq","iu","mmhg","mosm",  # volume / medical dosing
"hz","mhz","ghz","watt","kw","mw","kcal","cal",  # frequency / power / energy
}
# Lowercase Roman numerals up to xx; reclassify_numbers_in_tokens retags
# matching BPE tokens (case-insensitively) as NUM.
# NOTE(review): the single letters "v" and "x" are absent — presumably
# deliberate, to avoid retagging ordinary single-letter tokens as numerals;
# confirm with the tokenizer's training data.
ROMAN_NUMERALS = {
"i","ii","iii","iv","vi","vii","viii","ix",
"xi","xii","xiii","xiv","xv","xvi","xvii","xviii","xix","xx",
}
# URL: http(s)://… or www.… up to the next whitespace character.
URL_RE = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
# @mention.  \w is already Unicode-aware in Python 3, so the explicit
# Latin-1-Supplement/Extended range \u00C0-\u024F is redundant but harmless.
MENTION_RE = re.compile(r'@[\w\u00C0-\u024F]+')
# #hashtag: same character class as mentions.
HASHTAG_RE = re.compile(r'#[\w\u00C0-\u024F]+')
# Numeric expressions (percentages, decimals, thousands-grouped numbers,
# fractions).  Bare integers are intentionally NOT matched — they are left
# to base tokenization / later reclassification.
#
# Alternative order matters: Python's re takes the FIRST alternative that
# matches at a position, not the longest.  The thousands-grouped branch
# must therefore come before the plain-decimal branch, otherwise
# "1.000.000" is split as NUM("1.000") + ".000" during substitution.
# The (?!\d) guard keeps long dot-decimals such as "3.1415" falling
# through to the plain-decimal branch, and (?:,\d+)? picks up the Turkish
# decimal tail in amounts like "1.234,56".
NUMBER_RE = re.compile(
r'%\d+[\.,]?\d*'                        # leading percent: %50, %7,5
r'|\d{1,3}(?:\.\d{3})+(?:,\d+)?(?!\d)'  # grouped: 1.000.000, 1.234,56
r'|\d+[\.,]\d+'                         # plain decimal: 3,14 / 3.1415
r'|\d+%'                                # trailing percent: 50%
r'|\d+/\d+'                             # fraction: 3/4
)
# Numeric dates, day-first (12.05.2024, 1/2/99) or year-first (2024-05-12),
# with . / - as separators.  Month-word dates ("12 mayıs") are not handled
# here; month words are covered by MONTH_NAMES instead.
DATE_RE = re.compile(
r'\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}'
r'|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}'
)
# A currency symbol glued to a number, on either side: $100, 99,90₺.
# NOTE(review): thousands-grouped amounts ("$1.000.000") only match through
# the first group — confirm whether such amounts occur upstream.
CURRENCY_RE = re.compile(r'[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]')
# ASCII emoticons — eyes [:;=], optional nose "-", one mouth char
# (e.g. ":)", ";-(", "=D", ":p") — plus the heart "<3".
TEXT_EMOJI_RE = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
# Maximal runs of emoji-range code points: emoticons, misc symbols &
# pictographs, transport, regional-indicator (flag) pairs, dingbats,
# supplemental symbols, and miscellaneous symbols.  A run of several
# emoji collapses into ONE match (note the trailing +).
UNICODE_EMOJI_RE = re.compile(
"[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
"\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
"\U00002700-\U000027BF\U0001F900-\U0001F9FF"
"\U00002600-\U000026FF]+",
flags=re.UNICODE,
)
def preprocess_special_tokens(text: str) -> tuple[str, list[dict]]:
    """Replace URLs, mentions, hashtags, dates, currency amounts, numbers
    and emoji with opaque NUL-delimited placeholders before base
    tokenization.

    Returns the rewritten text plus one record per placeholder
    (``placeholder``, ``type``, ``original``) so the surface forms can be
    put back afterwards by ``restore_special_tokens``.
    """
    records: list[dict] = []
    next_id = 0

    def stash(match: re.Match, ttype: str) -> str:
        # Mint a unique marker and remember what it replaced.
        nonlocal next_id
        marker = f"\x00{ttype}{next_id}\x00"
        next_id += 1
        records.append({"placeholder": marker, "type": ttype, "original": match.group(0)})
        return marker

    # Pass order matters: URLs go first (their dots and slashes would
    # otherwise look like dates or fractions); emoji go last.
    passes = (
        (URL_RE, "URL"),
        (MENTION_RE, "MENTION"),
        (HASHTAG_RE, "HASHTAG"),
        (DATE_RE, "DATE"),
        (CURRENCY_RE, "UNIT"),
        (NUMBER_RE, "NUM"),
        (UNICODE_EMOJI_RE, "EMOJI"),
        (TEXT_EMOJI_RE, "EMOJI"),
    )
    for pattern, ttype in passes:
        # Default-bind ttype so each lambda keeps its own pass type.
        text = pattern.sub(lambda m, t=ttype: stash(m, t), text)
    return text, records
def restore_special_tokens(tokens: list[dict], placeholders: list[dict]) -> list[dict]:
    """Swap placeholder markers back to their original surface forms.

    Each placeholder is emitted exactly once as a token of its recorded
    type, with a leading space and a ``_<type>`` flag; any later token
    containing an already-restored placeholder (e.g. a BPE fragment of
    it) is dropped.  Tokens without placeholders pass through unchanged.

    Fix over the previous version: a token may contain SEVERAL
    placeholders (two special tokens fused into one BPE piece).  The old
    ``next(...)`` scan restored only one of them and silently lost the
    others; here every placeholder in a token is restored, in order of
    appearance.
    """
    if not placeholders:
        return tokens
    ph_map = {p["placeholder"]: p for p in placeholders}
    restored: set[str] = set()
    result: list[dict] = []
    for tok in tokens:
        raw = tok["token"]
        # All placeholders present in this token, left to right.
        hits = sorted((raw.find(ph), ph) for ph in ph_map if ph in raw)
        if not hits:
            result.append(tok)
            continue
        for _, ph in hits:
            if ph in restored:
                continue  # fragment of an already-emitted placeholder
            restored.add(ph)
            info = ph_map[ph]
            ttype = info["type"]
            result.append({
                "token": f" {info['original']}",
                "type": ttype,
                f"_{ttype.lower()}": True,
            })
    return result
def reclassify_numbers_in_tokens(tokens: list[dict]) -> list[dict]:
    """Second pass over the token stream: retag number-like, unit, Roman
    numeral and month-name tokens that the placeholder pre-pass missed.

    Only BPE and ROOT tokens are candidates.  Numeric forms (NUMBER_RE
    fullmatch) may come from either type; unit / Roman-numeral /
    month-name lookups apply to BPE tokens only.  All other tokens pass
    through untouched.
    """
    out: list[dict] = []
    for tok in tokens:
        kind = tok["type"]
        if kind not in ("BPE", "ROOT"):
            out.append(tok)
            continue
        stripped = tok["token"].strip()
        lowered = stripped.lower()
        if NUMBER_RE.fullmatch(stripped):
            tok = {**tok, "type": "NUM", "_num": True}
        elif kind == "BPE" and lowered in UNITS:
            tok = {**tok, "type": "UNIT", "_unit": True}
        elif kind == "BPE" and lowered in ROMAN_NUMERALS:
            tok = {**tok, "type": "NUM", "_roman": True}
        elif kind == "BPE" and lowered in MONTH_NAMES:
            tok = {**tok, "type": "ROOT", "_month": True}
        out.append(tok)
    return out