# turk-tokenizer / turk_tokenizer/_normalizer.py
# Author: nmstech
# Initial release: TurkTokenizer v1.0.0 — TR-MMLU 92%
# (commit ca41c16, verified)
"""Fix 8: Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI)."""
from __future__ import annotations
import re
# Lowercase Turkish and English month names.  reclassify_numbers_in_tokens
# retags BPE tokens found here (compared case-insensitively) as ROOT.
MONTH_NAMES = {
"ocak","şubat","mart","nisan","mayıs","haziran",
"temmuz","ağustos","eylül","ekim","kasım","aralık",
"january","february","march","april","may","june",
"july","august","september","october","november","december",
}
# Lowercase unit abbreviations; reclassify_numbers_in_tokens retags BPE
# tokens found here (compared case-insensitively) as UNIT.
UNITS = {
"km","m","cm","mm","nm",  # length
"kg","g","mg","ton",  # mass
"sn","dk","sa","ms",  # time (Turkish: saniye, dakika, saat) + milliseconds
"tl","usd","eur","gbp",  # currency codes
"kb","mb","gb","tb","pb",  # data sizes
"ml","mcg","meq","iu","mmhg","mosm",  # volume / medical dosing
"hz","mhz","ghz","watt","kw","mw","kcal","cal",  # frequency / power / energy
}
# Lowercase Roman numerals up to xx; reclassify_numbers_in_tokens retags
# matching BPE tokens (case-insensitively) as NUM.
# NOTE(review): the single letters "v" and "x" are absent — presumably
# deliberate, to avoid retagging ordinary single-letter tokens as numerals;
# confirm with the tokenizer's training data.
ROMAN_NUMERALS = {
"i","ii","iii","iv","vi","vii","viii","ix",
"xi","xii","xiii","xiv","xv","xvi","xvii","xviii","xix","xx",
}
# URL: http(s)://… or www.… up to the next whitespace character.
URL_RE = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
# @mention.  \w is already Unicode-aware in Python 3, so the explicit
# Latin-1-Supplement/Extended range \u00C0-\u024F is redundant but harmless.
MENTION_RE = re.compile(r'@[\w\u00C0-\u024F]+')
# #hashtag: same character class as mentions.
HASHTAG_RE = re.compile(r'#[\w\u00C0-\u024F]+')
# Numeric expressions (percentages, decimals, thousands-grouped numbers,
# fractions).  Bare integers are intentionally NOT matched — they are left
# to base tokenization / later reclassification.
#
# Alternative order matters: Python's re takes the FIRST alternative that
# matches at a position, not the longest.  The thousands-grouped branch
# must therefore come before the plain-decimal branch, otherwise
# "1.000.000" is split as NUM("1.000") + ".000" during substitution.
# The (?!\d) guard keeps long dot-decimals such as "3.1415" falling
# through to the plain-decimal branch, and (?:,\d+)? picks up the Turkish
# decimal tail in amounts like "1.234,56".
NUMBER_RE = re.compile(
r'%\d+[\.,]?\d*'                        # leading percent: %50, %7,5
r'|\d{1,3}(?:\.\d{3})+(?:,\d+)?(?!\d)'  # grouped: 1.000.000, 1.234,56
r'|\d+[\.,]\d+'                         # plain decimal: 3,14 / 3.1415
r'|\d+%'                                # trailing percent: 50%
r'|\d+/\d+'                             # fraction: 3/4
)
# Numeric dates, day-first (12.05.2024, 1/2/99) or year-first (2024-05-12),
# with . / - as separators.  Month-word dates ("12 mayıs") are not handled
# here; month words are covered by MONTH_NAMES instead.
DATE_RE = re.compile(
r'\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}'
r'|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}'
)
# A currency symbol glued to a number, on either side: $100, 99,90₺.
# NOTE(review): thousands-grouped amounts ("$1.000.000") only match through
# the first group — confirm whether such amounts occur upstream.
CURRENCY_RE = re.compile(r'[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]')
# ASCII emoticons — eyes [:;=], optional nose "-", one mouth char
# (e.g. ":)", ";-(", "=D", ":p") — plus the heart "<3".
TEXT_EMOJI_RE = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
# Maximal runs of emoji-range code points: emoticons, misc symbols &
# pictographs, transport, regional-indicator (flag) pairs, dingbats,
# supplemental symbols, and miscellaneous symbols.  A run of several
# emoji collapses into ONE match (note the trailing +).
UNICODE_EMOJI_RE = re.compile(
"[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
"\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
"\U00002700-\U000027BF\U0001F900-\U0001F9FF"
"\U00002600-\U000026FF]+",
flags=re.UNICODE,
)
def preprocess_special_tokens(text: str) -> tuple[str, list[dict]]:
    """Replace URLs, mentions, hashtags, dates, currency amounts, numbers
    and emoji with opaque NUL-delimited placeholders before base
    tokenization.

    Returns the rewritten text plus one record per placeholder
    (``placeholder``, ``type``, ``original``) so the surface forms can be
    put back afterwards by ``restore_special_tokens``.
    """
    records: list[dict] = []
    next_id = 0

    def stash(match: re.Match, ttype: str) -> str:
        # Mint a unique marker and remember what it replaced.
        nonlocal next_id
        marker = f"\x00{ttype}{next_id}\x00"
        next_id += 1
        records.append({"placeholder": marker, "type": ttype, "original": match.group(0)})
        return marker

    # Pass order matters: URLs go first (their dots and slashes would
    # otherwise look like dates or fractions); emoji go last.
    passes = (
        (URL_RE, "URL"),
        (MENTION_RE, "MENTION"),
        (HASHTAG_RE, "HASHTAG"),
        (DATE_RE, "DATE"),
        (CURRENCY_RE, "UNIT"),
        (NUMBER_RE, "NUM"),
        (UNICODE_EMOJI_RE, "EMOJI"),
        (TEXT_EMOJI_RE, "EMOJI"),
    )
    for pattern, ttype in passes:
        # Default-bind ttype so each lambda keeps its own pass type.
        text = pattern.sub(lambda m, t=ttype: stash(m, t), text)
    return text, records
def restore_special_tokens(tokens: list[dict], placeholders: list[dict]) -> list[dict]:
    """Swap placeholder markers back to their original surface forms.

    Each placeholder is emitted exactly once as a token of its recorded
    type, with a leading space and a ``_<type>`` flag; any later token
    containing an already-restored placeholder (e.g. a BPE fragment of
    it) is dropped.  Tokens without placeholders pass through unchanged.

    Fix over the previous version: a token may contain SEVERAL
    placeholders (two special tokens fused into one BPE piece).  The old
    ``next(...)`` scan restored only one of them and silently lost the
    others; here every placeholder in a token is restored, in order of
    appearance.
    """
    if not placeholders:
        return tokens
    ph_map = {p["placeholder"]: p for p in placeholders}
    restored: set[str] = set()
    result: list[dict] = []
    for tok in tokens:
        raw = tok["token"]
        # All placeholders present in this token, left to right.
        hits = sorted((raw.find(ph), ph) for ph in ph_map if ph in raw)
        if not hits:
            result.append(tok)
            continue
        for _, ph in hits:
            if ph in restored:
                continue  # fragment of an already-emitted placeholder
            restored.add(ph)
            info = ph_map[ph]
            ttype = info["type"]
            result.append({
                "token": f" {info['original']}",
                "type": ttype,
                f"_{ttype.lower()}": True,
            })
    return result
def reclassify_numbers_in_tokens(tokens: list[dict]) -> list[dict]:
    """Second pass over the token stream: retag number-like, unit, Roman
    numeral and month-name tokens that the placeholder pre-pass missed.

    Only BPE and ROOT tokens are candidates.  Numeric forms (NUMBER_RE
    fullmatch) may come from either type; unit / Roman-numeral /
    month-name lookups apply to BPE tokens only.  All other tokens pass
    through untouched.
    """
    out: list[dict] = []
    for tok in tokens:
        kind = tok["type"]
        if kind not in ("BPE", "ROOT"):
            out.append(tok)
            continue
        stripped = tok["token"].strip()
        lowered = stripped.lower()
        if NUMBER_RE.fullmatch(stripped):
            tok = {**tok, "type": "NUM", "_num": True}
        elif kind == "BPE" and lowered in UNITS:
            tok = {**tok, "type": "UNIT", "_unit": True}
        elif kind == "BPE" and lowered in ROMAN_NUMERALS:
            tok = {**tok, "type": "NUM", "_roman": True}
        elif kind == "BPE" and lowered in MONTH_NAMES:
            tok = {**tok, "type": "ROOT", "_month": True}
        out.append(tok)
    return out