turk-tokenizer / turk_tokenizer /_suffix_expander.py
nmstech's picture
Initial release: TurkTokenizer v1.0.0 — TR-MMLU 92%
ca41c16 verified
"""Fix 3: BPE → SUFFIX reclassification. Fix 5: Punctuation → PUNCT."""
from __future__ import annotations
# Characters treated as punctuation: ASCII marks, en/em dashes, typographic
# quotes, prime, guillemets, and a few symbols (bullet, ellipsis, middle dot,
# degree, plus-minus, multiplication and division signs).
PUNCT_CHARS = set(
    "".join(
        (
            '?.,;:!-\u2013\u2014()[]{}"`/\\|@#$%^&*+=<>~',
            "\u2019\u2018\u201c\u201d\u2032\u00ab\u00bb\u2039\u203a",
            "\u2022\u2026\u00b7\u00b0\u00b1\u00d7\u00f7",
        )
    )
)

# ASCII digits 0-9, kept as a set for fast membership tests.
_PUNCT_DIGITS = set(map(str, range(10)))
def _is_punct(token: str) -> bool:
    """Return True when *token* consists solely of punctuation-like characters.

    Surrounding whitespace is ignored, and an empty or whitespace-only token
    is never punctuation. A character qualifies when it is in ``PUNCT_CHARS``,
    is an ASCII digit, or has a code point above U+02FF and is not alphabetic.
    """
    core = token.strip()
    if core == "":
        return False
    for ch in core:
        if ch in PUNCT_CHARS or ch in _PUNCT_DIGITS:
            continue
        # Beyond the Latin/IPA range, any non-letter counts as punctuation.
        if ord(ch) > 0x02FF and not ch.isalpha():
            continue
        return False
    return True
# ── Suffix dictionary (260+ entries) ─────────────────────────────────────────
# Maps a lowercase suffix surface form to its morphological label.
# Every key must appear exactly once: a repeated key in a dict literal is
# silently overwritten, which would hide a conflicting label.
EXTENDED_SUFFIX_MAP: dict[str, str] = {
    # Plural + case
    "leri": "-PL+ACC", "ları": "-PL+ACC",
    "lere": "-PL+DAT", "lara": "-PL+DAT",
    "lerin": "-PL+GEN", "ların": "-PL+GEN",
    "lerde": "-PL+LOC", "larda": "-PL+LOC",
    "lerden": "-PL+ABL", "lardan": "-PL+ABL",
    "lerle": "-PL+INS", "larla": "-PL+INS",
    "lerce": "-PL+EQU", "larca": "-PL+EQU",
    # -yon / loanword suffixes
    "yon": "-YON", "iyon": "-YON", "asyon": "-YON", "izasyon": "-YON",
    # Adjective derivation
    "al": "-ADJ", "el": "-ADJ", "ik": "-ADJ",
    "sal": "-ADJ.TR", "sel": "-ADJ.TR",
    # 1st/2nd plural possessive
    "imiz": "-P1PL", "ımız": "-P1PL", "umuz": "-P1PL", "ümüz": "-P1PL",
    "iniz": "-P2PL", "ınız": "-P2PL", "unuz": "-P2PL", "ünüz": "-P2PL",
    # Arabic long vowels
    "\u00e2": "-LONG_A", "\u00ee": "-LONG_I", "\u00fb": "-LONG_U",
    # Roman numerals
    "ii": "-ROM", "iii": "-ROM", "iv": "-ROM", "vi": "-ROM",
    "vii": "-ROM", "viii": "-ROM", "ix": "-ROM", "xi": "-ROM",
    "xii": "-ROM", "xiii": "-ROM", "xiv": "-ROM", "xv": "-ROM",
    # Frequent BPE pieces
    "eri": "-PL.SFX", "una": "-P3+DAT", "iril": "-PASS.SFX",
    "yan": "-PART.ACT", "ren": "-PART.ACT", "ıda": "-LOC.SFX",
    "maya": "-NEG.INF", "üler": "-PL.SFX", "ıler": "-PL.SFX",
    "ni": "-ACC.SFX", "ri": "-PL.SFX", "lan": "-PASS+NZ",
    "on": "-YON.SFX",
    # Possessive + case compounds
    "ımı": "-P1+ACC", "imi": "-P1+ACC", "umu": "-P1+ACC", "ümü": "-P1+ACC",
    "ıyla": "-INS.COMP", "iyle": "-INS.COMP", "uyla": "-INS.COMP", "üyle": "-INS.COMP",
    "kten": "-ABL.COMP", "ğından": "-ABL.COMP", "ğinden": "-ABL.COMP",
    "yla": "-COM", "yle": "-COM",
    # Abstract noun + possessive
    "liği": "-ABSTR+P3", "lığı": "-ABSTR+P3",
    "luğu": "-ABSTR+P3", "lüğü": "-ABSTR+P3",
    "liğini": "-ABSTR+P3+ACC", "lığını": "-ABSTR+P3+ACC",
    # -izm (ideology)
    "izm": "-ISM", "izmi": "-ISM+P3", "izmde": "-ISM+LOC",
    "izmden": "-ISM+ABL", "izmin": "-ISM+GEN",
    # Aorist
    "lir": "-AOR3SG", "lır": "-AOR3SG", "lur": "-AOR3SG", "lür": "-AOR3SG",
    # 3sg possessive + case
    "ine": "-P3+DAT", "ına": "-P3+DAT", "une": "-P3+DAT", "üne": "-P3+DAT",
    "inde": "-P3+LOC", "ında": "-P3+LOC", "unda": "-P3+LOC", "ünde": "-P3+LOC",
    "ini": "-P3+ACC", "ını": "-P3+ACC", "unu": "-P3+ACC", "ünü": "-P3+ACC",
    "inden": "-P3+ABL", "ından": "-P3+ABL", "undan": "-P3+ABL", "ünden": "-P3+ABL",
    # -daki
    "daki": "-LOC+REL", "deki": "-LOC+REL", "taki": "-LOC+REL", "teki": "-LOC+REL",
    # Passive + nominalization
    # ("lan" carries the same label and is defined once, under
    # "Frequent BPE pieces" above.)
    "len": "-PASS+NZ",
    # Verbal noun
    "mesi": "-VN3", "ması": "-VN3",
    "mesini": "-VN3+ACC", "masını": "-VN3+ACC",
    "mesine": "-VN3+DAT", "masına": "-VN3+DAT",
    "mesinde": "-VN3+LOC", "masında": "-VN3+LOC",
    # Genitive + possessive
    "ının": "-GEN+P", "inin": "-GEN+P", "unun": "-GEN+P", "ünün": "-GEN+P",
    # Participle
    "diği": "-PART", "dığı": "-PART", "tiği": "-PART", "tığı": "-PART",
    "duğu": "-PART", "düğü": "-PART", "tuğu": "-PART", "tüğü": "-PART",
    "ği": "-PART.SFX", "ğı": "-PART.SFX", "gu": "-PART.SFX", "gü": "-PART.SFX",
    # Negative verbal noun
    "mas": "-NEG.VN", "mes": "-NEG.VN",
    # 2sg imperative
    "sin": "-IMP2", "sın": "-IMP2", "sun": "-IMP2", "sün": "-IMP2",
    # Passive short
    "ıl": "-PASS", "il": "-PASS", "ul": "-PASS", "ül": "-PASS",
    # Causative + VN
    "irme": "-CAUS+VN", "ırma": "-CAUS+VN", "urma": "-CAUS+VN",
    "ürme": "-CAUS+VN", "erme": "-CAUS+VN", "arma": "-CAUS+VN",
    # Accusative
    "ı": "-ACC", "i": "-ACC", "u": "-ACC", "ü": "-ACC",
    # Past tense
    "dım": "-DI1SG", "dim": "-DI1SG", "dum": "-DI1SG", "düm": "-DI1SG",
    "tım": "-DI1SG", "tim": "-DI1SG", "tum": "-DI1SG", "tüm": "-DI1SG",
    "dık": "-DI1PL", "dik": "-DI1PL", "duk": "-DI1PL", "dük": "-DI1PL",
    "tık": "-DI1PL", "tik": "-DI1PL", "tuk": "-DI1PL", "tük": "-DI1PL",
    "dın": "-DI2SG", "din": "-DI2SG", "dun": "-DI2SG", "dün": "-DI2SG",
    "tın": "-DI2SG", "tin": "-DI2SG", "tun": "-DI2SG", "tün": "-DI2SG",
    "d": "-PAST", "t": "-PAST",
    # Conditional
    "sa": "-COND", "se": "-COND",
    # Progressive
    "yor": "-PROG",
    # Simple past
    "dı": "-PST", "di": "-PST", "du": "-PST", "dü": "-PST",
    "tı": "-PST", "ti": "-PST", "tu": "-PST", "tü": "-PST",
    # Aorist short
    "ir": "-AOR", "ır": "-AOR", "ur": "-AOR", "ür": "-AOR",
    "er": "-AOR", "ar": "-AOR",
    # Evidential past
    "mış": "-EVID", "miş": "-EVID", "muş": "-EVID", "müş": "-EVID",
    # Negation
    "ma": "-NEG", "me": "-NEG",
    "lama": "-VN+NEG", "leme": "-VN+NEG",
    # Abilitative
    "bil": "-ABIL",
    # Necessitative
    "malı": "-NECES", "meli": "-NECES",
    # Infinitive
    "mak": "-INF", "mek": "-INF",
    # -ken (while/when)
    "ken": "-WHEN",
    # Converb
    "arak": "-CONV", "erek": "-CONV",
    # With / without
    "lı": "-WITH", "li": "-WITH", "lu": "-WITH", "lü": "-WITH",
    # Agentive
    "cı": "-AGT", "ci": "-AGT", "cu": "-AGT", "cü": "-AGT",
    "çı": "-AGT", "çi": "-AGT", "çu": "-AGT", "çü": "-AGT",
    # Abstract noun
    "lık": "-ABSTR", "lik": "-ABSTR", "luk": "-ABSTR", "lük": "-ABSTR",
    "lığ": "-ABSTR", "liğ": "-ABSTR",
    # Optative 1pl
    "elim": "-OPT1PL", "alım": "-OPT1PL",
    # Person suffixes
    "ım": "-1SG", "im": "-1SG", "um": "-1SG", "üm": "-1SG",
    "ın": "-2SG", "in": "-2SG", "un": "-2SG", "ün": "-2SG",
    "iz": "-1PL", "ız": "-1PL", "uz": "-1PL", "üz": "-1PL",
    "nız": "-2PL", "niz": "-2PL", "nuz": "-2PL", "nüz": "-2PL",
    # Question
    "mı": "-Q", "mi": "-Q", "mu": "-Q", "mü": "-Q",
    # Dative
    "a": "-DAT", "e": "-DAT", "ya": "-DAT", "ye": "-DAT",
    # Ablative
    "dan": "-ABL", "den": "-ABL", "tan": "-ABL", "ten": "-ABL",
    # Locative
    "da": "-LOC", "de": "-LOC", "ta": "-LOC", "te": "-LOC",
    # Plural
    "lar": "-PL", "ler": "-PL",
    # 3sg possessive short
    "sı": "-P3", "si": "-P3", "su": "-P3", "sü": "-P3",
    # Genitive
    "nin": "-GEN", "nın": "-GEN", "nun": "-GEN", "nün": "-GEN",
    # Instrumental
    "le": "-INS", "la": "-INS",
    # Equative
    "ce": "-EQU", "ca": "-EQU", "çe": "-EQU", "ça": "-EQU",
    # Glide
    "y": "-GLIDE",
}
# (surface, label) pairs ordered longest surface form first; the sort is
# stable, so entries of equal length keep their order from the dictionary.
_SUFFIX_MAP_SORTED = sorted(
    EXTENDED_SUFFIX_MAP.items(),
    key=lambda entry: -len(entry[0]),
)
def _turkish_lower(text: str) -> str:
    """Lowercase *text* using Turkish casing for dotted and dotless I."""
    # str.lower() turns "I" into "i" and "İ" (U+0130) into "i" + U+0307.
    # Turkish needs "I" -> "ı" (U+0131) and "İ" -> "i", so map those two
    # letters explicitly before lowering the rest.
    return text.replace("\u0130", "i").replace("I", "\u0131").lower()


def reclassify_bpe_suffixes(tokens: list[dict]) -> list[dict]:
    """Reclassify BPE tokens: punctuation → PUNCT, word-internal suffixes → SUFFIX.

    Args:
        tokens: Token dicts; each must have a ``"type"`` key, and BPE tokens
            must also have a ``"token"`` key holding the surface string.

    Returns:
        A new list of the same length. Tokens whose type is not ``"BPE"`` are
        passed through as the same objects. A BPE token is replaced by a
        modified copy when:

        * ``_is_punct`` accepts its surface: type becomes ``"PUNCT"`` and
          ``"_punct": True`` is added;
        * it has no surrounding whitespace, the previously emitted token is a
          ROOT, SUFFIX or BPE, and its lowercased surface is a key of
          ``EXTENDED_SUFFIX_MAP``: type becomes ``"SUFFIX"`` and
          ``"_reclassified"`` / ``"_suffix_label"`` are added. Keys already
          present on the token (other than ``"token"`` and ``"type"``) are
          carried over and take precedence over these added markers.

        Every other BPE token is passed through unchanged.
    """
    result: list[dict] = []
    for tok in tokens:
        if tok["type"] != "BPE":
            result.append(tok)
            continue

        raw = tok["token"]
        if _is_punct(raw):
            result.append({**tok, "type": "PUNCT", "_punct": True})
            continue

        # Only tokens with no leading or trailing whitespace are treated as
        # word-internal pieces.
        if raw != raw.strip():
            result.append(tok)
            continue

        # A suffix must attach to something: the token emitted just before it
        # has to be a root, a suffix, or another BPE piece.
        if not result or result[-1]["type"] not in ("ROOT", "SUFFIX", "BPE"):
            result.append(tok)
            continue

        # Exact-match lookup. The plain lowercase form is tried first so that
        # everything that matched before keeps its label; the Turkish-aware
        # form then catches upper-case input such as "LERİ" or "LARI".
        label = EXTENDED_SUFFIX_MAP.get(raw.lower())
        if not label:
            label = EXTENDED_SUFFIX_MAP.get(_turkish_lower(raw))
        if not label:
            result.append(tok)
            continue

        result.append({
            "token": raw,
            "type": "SUFFIX",
            "_reclassified": True,
            "_suffix_label": label,
            **{k: v for k, v in tok.items() if k not in ("token", "type")},
        })
    return result