"""Fix 3: BPE → SUFFIX reclassification. Fix 5: Punctuation → PUNCT.""" from __future__ import annotations PUNCT_CHARS = set( '?.,;:!-\u2013\u2014()[]{}"`/\\|@#$%^&*+=<>~' '\u2019\u2018\u201c\u201d\u2032\u00ab\u00bb\u2039\u203a' '\u2022\u2026\u00b7\u00b0\u00b1\u00d7\u00f7' ) _PUNCT_DIGITS = set("0123456789") def _is_punct(token: str) -> bool: s = token.strip() if not s: return False return all( c in PUNCT_CHARS or c in _PUNCT_DIGITS or (ord(c) > 0x02FF and not c.isalpha()) for c in s ) # ── Suffix dictionary (260+ entries) ───────────────────────────────────────── EXTENDED_SUFFIX_MAP: dict[str, str] = { # Plural + case "leri": "-PL+ACC", "ları": "-PL+ACC", "lere": "-PL+DAT", "lara": "-PL+DAT", "lerin": "-PL+GEN", "ların": "-PL+GEN", "lerde": "-PL+LOC", "larda": "-PL+LOC", "lerden": "-PL+ABL","lardan": "-PL+ABL", "lerle": "-PL+INS", "larla": "-PL+INS", "lerce": "-PL+EQU", "larca": "-PL+EQU", # -yon / loanword suffixes "yon": "-YON", "iyon": "-YON", "asyon": "-YON", "izasyon": "-YON", # Adjective derivation "al": "-ADJ", "el": "-ADJ", "ik": "-ADJ", "sal": "-ADJ.TR", "sel": "-ADJ.TR", # 1st/2nd plural possessive "imiz": "-P1PL", "ımız": "-P1PL", "umuz": "-P1PL", "ümüz": "-P1PL", "iniz": "-P2PL", "ınız": "-P2PL", "unuz": "-P2PL", "ünüz": "-P2PL", # Arabic long vowels "\u00e2": "-LONG_A", "\u00ee": "-LONG_I", "\u00fb": "-LONG_U", # Roman numerals "ii": "-ROM", "iii": "-ROM", "iv": "-ROM", "vi": "-ROM", "vii": "-ROM", "viii": "-ROM", "ix": "-ROM", "xi": "-ROM", "xii": "-ROM", "xiii": "-ROM", "xiv": "-ROM", "xv": "-ROM", # Frequent BPE pieces "eri": "-PL.SFX", "una": "-P3+DAT", "iril": "-PASS.SFX", "yan": "-PART.ACT","ren": "-PART.ACT", "ıda": "-LOC.SFX", "maya": "-NEG.INF", "üler": "-PL.SFX", "ıler": "-PL.SFX", "ni": "-ACC.SFX", "ri": "-PL.SFX", "lan": "-PASS+NZ", "on": "-YON.SFX", # Possessive + case compounds "ımı": "-P1+ACC", "imi": "-P1+ACC", "umu": "-P1+ACC", "ümü": "-P1+ACC", "ıyla": "-INS.COMP","iyle": "-INS.COMP","uyla": "-INS.COMP","üyle": "-INS.COMP", "kten": "-ABL.COMP","ğından": "-ABL.COMP","ğinden": "-ABL.COMP", "yla": "-COM", "yle": "-COM", # Abstract noun + possessive "liği": "-ABSTR+P3", "lığı": "-ABSTR+P3", "luğu": "-ABSTR+P3", "lüğü": "-ABSTR+P3", "liğini": "-ABSTR+P3+ACC", "lığını": "-ABSTR+P3+ACC", # -izm (ideology) "izm": "-ISM", "izmi": "-ISM+P3", "izmde": "-ISM+LOC", "izmden": "-ISM+ABL", "izmin": "-ISM+GEN", # Aorist "lir": "-AOR3SG", "lır": "-AOR3SG", "lur": "-AOR3SG", "lür": "-AOR3SG", # 3sg possessive + case "ine": "-P3+DAT", "ına": "-P3+DAT", "une": "-P3+DAT", "üne": "-P3+DAT", "inde": "-P3+LOC", "ında": "-P3+LOC", "unda": "-P3+LOC", "ünde": "-P3+LOC", "ini": "-P3+ACC", "ını": "-P3+ACC", "unu": "-P3+ACC", "ünü": "-P3+ACC", "inden": "-P3+ABL","ından": "-P3+ABL","undan": "-P3+ABL","ünden": "-P3+ABL", # -daki "daki": "-LOC+REL","deki": "-LOC+REL","taki": "-LOC+REL","teki": "-LOC+REL", # Passive + nominalization "lan": "-PASS+NZ", "len": "-PASS+NZ", # Verbal noun "mesi": "-VN3", "ması": "-VN3", "mesini": "-VN3+ACC", "masını": "-VN3+ACC", "mesine": "-VN3+DAT", "masına": "-VN3+DAT", "mesinde": "-VN3+LOC", "masında": "-VN3+LOC", # Genitive + possessive "ının": "-GEN+P", "inin": "-GEN+P", "unun": "-GEN+P", "ünün": "-GEN+P", # Participle "diği": "-PART", "dığı": "-PART", "tiği": "-PART", "tığı": "-PART", "duğu": "-PART", "düğü": "-PART", "tuğu": "-PART", "tüğü": "-PART", "ği": "-PART.SFX","ğı": "-PART.SFX","gu": "-PART.SFX","gü": "-PART.SFX", # Negative verbal noun "mas": "-NEG.VN", "mes": "-NEG.VN", # 2sg imperative "sin": "-IMP2", "sın": "-IMP2", "sun": "-IMP2", "sün": "-IMP2", # Passive short "ıl": "-PASS", "il": "-PASS", "ul": "-PASS", "ül": "-PASS", # Causative + VN "irme": "-CAUS+VN","ırma": "-CAUS+VN","urma": "-CAUS+VN", "ürme": "-CAUS+VN","erme": "-CAUS+VN","arma": "-CAUS+VN", # Accusative "ı": "-ACC", "i": "-ACC", "u": "-ACC", "ü": "-ACC", # Past tense "dım": "-DI1SG","dim": "-DI1SG","dum": "-DI1SG","düm": "-DI1SG", "tım": "-DI1SG","tim": "-DI1SG","tum": "-DI1SG","tüm": "-DI1SG", "dık": "-DI1PL","dik": "-DI1PL","duk": "-DI1PL","dük": "-DI1PL", "tık": "-DI1PL","tik": "-DI1PL","tuk": "-DI1PL","tük": "-DI1PL", "dın": "-DI2SG","din": "-DI2SG","dun": "-DI2SG","dün": "-DI2SG", "tın": "-DI2SG","tin": "-DI2SG","tun": "-DI2SG","tün": "-DI2SG", "d": "-PAST", "t": "-PAST", # Conditional "sa": "-COND", "se": "-COND", # Progressive "yor": "-PROG", # Simple past "dı": "-PST", "di": "-PST", "du": "-PST", "dü": "-PST", "tı": "-PST", "ti": "-PST", "tu": "-PST", "tü": "-PST", # Aorist short "ir": "-AOR", "ır": "-AOR", "ur": "-AOR", "ür": "-AOR", "er": "-AOR", "ar": "-AOR", # Evidential past "mış": "-EVID","miş": "-EVID","muş": "-EVID","müş": "-EVID", # Negation "ma": "-NEG", "me": "-NEG", "lama": "-VN+NEG","leme": "-VN+NEG", # Abilitative "bil": "-ABIL", # Necessitative "malı": "-NECES","meli": "-NECES", # Infinitive "mak": "-INF", "mek": "-INF", # -ken (while/when) "ken": "-WHEN", # Converb "arak": "-CONV","erek": "-CONV", # With / without "lı": "-WITH", "li": "-WITH", "lu": "-WITH", "lü": "-WITH", # Agentive "cı": "-AGT", "ci": "-AGT", "cu": "-AGT", "cü": "-AGT", "çı": "-AGT", "çi": "-AGT", "çu": "-AGT", "çü": "-AGT", # Abstract noun "lık": "-ABSTR","lik": "-ABSTR","luk": "-ABSTR","lük": "-ABSTR", "lığ": "-ABSTR","liğ": "-ABSTR", # Optative 1pl "elim": "-OPT1PL","alım": "-OPT1PL", # Person suffixes "ım": "-1SG", "im": "-1SG", "um": "-1SG", "üm": "-1SG", "ın": "-2SG", "in": "-2SG", "un": "-2SG", "ün": "-2SG", "iz": "-1PL", "ız": "-1PL", "uz": "-1PL", "üz": "-1PL", "nız": "-2PL","niz": "-2PL","nuz": "-2PL","nüz": "-2PL", # Question "mı": "-Q", "mi": "-Q", "mu": "-Q", "mü": "-Q", # Dative "a": "-DAT", "e": "-DAT", "ya": "-DAT", "ye": "-DAT", # Ablative "dan": "-ABL","den": "-ABL","tan": "-ABL","ten": "-ABL", # Locative "da": "-LOC", "de": "-LOC", "ta": "-LOC", "te": "-LOC", # Plural "lar": "-PL", "ler": "-PL", # 3sg possessive short "sı": "-P3", "si": "-P3", "su": "-P3", "sü": "-P3", # Genitive "nin": "-GEN","nın": "-GEN","nun": "-GEN","nün": "-GEN", # Instrumental "le": "-INS", "la": "-INS", # Equative "ce": "-EQU","ca": "-EQU","çe": "-EQU","ça": "-EQU", # Glide "y": "-GLIDE", } _SUFFIX_MAP_SORTED = sorted( EXTENDED_SUFFIX_MAP.items(), key=lambda x: len(x[0]), reverse=True ) def reclassify_bpe_suffixes(tokens: list[dict]) -> list[dict]: """Reclassify BPE tokens: punctuation → PUNCT, word-internal suffixes → SUFFIX.""" result: list[dict] = [] for tok in tokens: if tok["type"] != "BPE": result.append(tok) continue raw = tok["token"] stripped = raw.strip() if _is_punct(raw): result.append({**tok, "type": "PUNCT", "_punct": True}) continue # Only reclassify tokens without a leading space (word-internal) if raw != stripped: result.append(tok) continue prev_ok = bool(result) and result[-1]["type"] in ("ROOT", "SUFFIX", "BPE") if not prev_ok: result.append(tok) continue sl = stripped.lower() label = next((lbl for surf, lbl in _SUFFIX_MAP_SORTED if sl == surf), None) if label: result.append({ "token": raw, "type": "SUFFIX", "_reclassified": True, "_suffix_label": label, **{k: v for k, v in tok.items() if k not in ("token", "type")}, }) else: result.append(tok) return result