import json
import os
from enum import Enum
from typing import Dict, List, Optional, Tuple, Union

from transformers import PreTrainedTokenizer

__version__ = "1.0.3"  # Fixed vocab merge order


class TurkishDecoder:
    # Define vowel sets as class constants for better performance
    ALL_VOWELS = "aeıioöuüâ"
    INCE_VOWELS = "eiöü"  # Front vowels
    AI_VOWELS = "aıâ"  # Back unrounded
    EI_VOWELS = "ei"  # Front unrounded
    OU_VOWELS = "ou"  # Back rounded
    HARD_CONSONANTS = "fstkçşhp"  # Sert ünsüzler
    WHITESPACE = " \n\t"

    def __init__(self, reverse_dict):
        self.reverse_dict = reverse_dict

    def _tr_capitalize(self, word: str) -> str:
        """Capitalize using Turkish casing rules (i -> İ, ı -> I)."""
        if not word:
            return ""
        if word.startswith("i"):
            return "İ" + word[1:]
        return word.capitalize()

    def _starts_with_vowel(self, word: str) -> bool:
        """Check if word starts with a vowel."""
        return bool(word and word[0] in self.ALL_VOWELS)

    def _ends_with_vowel(self, word: str) -> bool:
        """Check if word ends with a vowel."""
        return bool(word and word[-1] in self.ALL_VOWELS)

    def _ends_with_any(self, word: str, charset: str) -> bool:
        # Scan backward from the end until the first vowel is reached
        i = len(word) - 1
        while i >= 0:
            if word[i] in charset:
                return True
            if word[i] in self.ALL_VOWELS:
                return False
            i -= 1
        return False

    def _ends_with_ince(self, word: str) -> bool:
        """Check if word ends with front vowels (ince ünlü)."""
        if word in ("saat", "kilovatsaat", "ziraat", "itaat", "istikbal"):
            return True
        # Scan backward until the first vowel
        return self._ends_with_any(word, self.INCE_VOWELS)

    def _ends_with_sert_unsuz(self, word: str) -> bool:
        """Check if word ends with a hard consonant."""
        return bool(word and word[-1] in self.HARD_CONSONANTS)

    def _get_vowel_suffix_index(self, prev_token: str) -> int:
        """Get suffix index based on vowel harmony rules."""
        if self._ends_with_any(prev_token, self.AI_VOWELS):
            return 0
        elif self._ends_with_any(prev_token, self.EI_VOWELS):
            return 1
        elif self._ends_with_any(prev_token, self.OU_VOWELS):
            return 2
        return 3
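
    # Expected behavior of the four-way harmony index above (worked examples,
    # not a test fixture):
    #   _get_vowel_suffix_index("kitap") -> 0  (last vowel 'a', back unrounded)
    #   _get_vowel_suffix_index("ev")    -> 1  (last vowel 'e', front unrounded)
    #   _get_vowel_suffix_index("okul")  -> 2  (last vowel 'u', back rounded)
    #   _get_vowel_suffix_index("köy")   -> 3  (fallback, covers 'ö'/'ü')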

    def _select_correct_suffix(self, i: int, ids: List[int], prev_token: str) -> str:
        """Select the correct suffix based on morphological rules."""
        suffixes = self.reverse_dict[ids[i]]
        token_id = ids[i]

        # Handle different suffix types with cleaner logic
        if token_id < 20013:
            # Basic suffix selection based on vowel harmony
            return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]
        elif token_id < 20023:  # nın, nin, nun, nün
            return suffixes[self._get_vowel_suffix_index(prev_token)]
        elif token_id == 20023:  # la, le, yla, yle
            end_of_word = True
            if i < len(ids) - 1:
                next_token = self.reverse_dict[ids[i + 1]][0]
                if next_token not in self.WHITESPACE:
                    end_of_word = False
            return self._handle_la_le_suffix(prev_token, suffixes, end_of_word)
        elif token_id <= 20025:  # da, de, ta, te, dan, den, tan, ten
            return self._handle_da_de_suffix(prev_token, suffixes)
        elif 20025 < token_id < 20029:  # dı, di, du, dü, tı, ti, tu, tü, etc.
            return self._handle_di_du_suffix(prev_token, suffixes)
        elif token_id == 20029:  # lık, lik, luk, lük, etc.
            return self._handle_lik_suffix(i, ids, prev_token, suffixes)
        elif token_id == 20030:  # cık, cik, cuk, cük, etc.
            return self._handle_cik_suffix(i, ids, prev_token, suffixes)
        elif token_id == 20031:  # mak, mek, may, mey
            return self._handle_mak_suffix(i, ids, prev_token, suffixes)
        elif token_id == 20032:  # acak, ecek, etc.
            return self._handle_acak_suffix(i, ids, prev_token, suffixes)
        return suffixes[0]

    def _handle_la_le_suffix(
        self, prev_token: str, suffixes: List[str], end_of_word: bool
    ) -> str:
        """Handle la/le/yla/yle suffix selection."""
        if self._ends_with_vowel(prev_token) and end_of_word:
            return suffixes[3] if self._ends_with_ince(prev_token) else suffixes[2]
        else:
            return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]

    def _handle_da_de_suffix(self, prev_token: str, suffixes: List[str]) -> str:
        """Handle da/de/ta/te suffix selection."""
        if self._ends_with_sert_unsuz(prev_token):
            return suffixes[3] if self._ends_with_ince(prev_token) else suffixes[2]
        return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]

    def _handle_di_du_suffix(self, prev_token: str, suffixes: List[str]) -> str:
        """Handle dı/di/du/dü suffix selection."""
        base_index = self._get_vowel_suffix_index(prev_token)
        return (
            suffixes[base_index + 4]
            if self._ends_with_sert_unsuz(prev_token)
            else suffixes[base_index]
        )

    def _handle_lik_suffix(
        self, i: int, ids: List[int], prev_token: str, suffixes: List[str]
    ) -> str:
        """Handle lık/lik/luk/lük suffix selection."""
        if i >= len(ids) - 1:
            return suffixes[0]
        next_token = self.reverse_dict[ids[i + 1]][0]
        base_index = self._get_vowel_suffix_index(prev_token)
        return (
            suffixes[base_index + 4]
            if self._starts_with_vowel(next_token)
            else suffixes[base_index]
        )

    def _handle_cik_suffix(
        self, i: int, ids: List[int], prev_token: str, suffixes: List[str]
    ) -> str:
        """Handle cık/cik/cuk/cük suffix selection."""
        if i >= len(ids) - 1:
            return suffixes[0]
        next_token = self.reverse_dict[ids[i + 1]][0]
        base_index = self._get_vowel_suffix_index(prev_token)
        if self._starts_with_vowel(next_token):
            offset = 12 if self._ends_with_sert_unsuz(prev_token) else 8
        else:
            offset = 4 if self._ends_with_sert_unsuz(prev_token) else 0
        return suffixes[base_index + offset]

    def _handle_mak_suffix(
        self, i: int, ids: List[int], prev_token: str, suffixes: List[str]
    ) -> str:
        """Handle mak/mek/may/mey suffix selection."""
        if i >= len(ids) - 1:
            return suffixes[0]
        next_token = self.reverse_dict[ids[i + 1]][0]
        base_index = 1 if self._ends_with_ince(prev_token) else 0
        return (
            suffixes[base_index + 2]
            if self._starts_with_vowel(next_token)
            else suffixes[base_index]
        )

    def _handle_acak_suffix(
        self, i: int, ids: List[int], prev_token: str, suffixes: List[str]
    ) -> str:
        """Handle acak/ecek/yacak/yecek suffix selection."""
        is_vowel_ending = self._ends_with_vowel(prev_token)
        is_ince = self._ends_with_ince(prev_token)
        is_vowel_starting = False
        if i < len(ids) - 1:
            next_token = self.reverse_dict[ids[i + 1]][0]
            is_vowel_starting = self._starts_with_vowel(next_token)

        if is_vowel_starting:
            if is_vowel_ending:
                return suffixes[7] if is_ince else suffixes[6]
            else:
                return suffixes[3] if is_ince else suffixes[2]
        else:
            if is_vowel_ending:
                return suffixes[5] if is_ince else suffixes[4]
            else:
                return suffixes[1] if is_ince else suffixes[0]

    def _select_correct_root(self, i: int, ids: List[int]) -> str:
        """Select the correct root form based on morphological context."""
        token_id = ids[i]
        tokens = self.reverse_dict[token_id]
        if i > len(ids) - 2:
            return tokens[0]
        next_token = self.reverse_dict[ids[i + 1]][0]

        # === EXCEPTIONS: Roots that should NOT soften ===
        # These roots end in consonants that look like they should soften
        # but actually stay unchanged before vowel-initial suffixes
        NO_SOFTENING_ROOTS = {
            204,  # hayat -> hayatı (not hayadı)
            220,  # belirt -> belirten (not belirden)
            298,  # meslek -> mesleki (not mesleği)
        }
        if token_id in NO_SOFTENING_ROOTS:
            return tokens[0]

        # === EXCEPTIONS: Roots where default is variant[1], not variant[0] ===
        # These have multiple forms but the common surface form is the second one
        # (documentation only; each root is handled individually below)
        DEFAULT_VARIANT_1_ROOTS = {
            2227,  # üçlü (not üçle)
            2209,  # yaşı (special handling below)
        }

        # Special case: üçlü - always return üçlü (variant 1) unless specific context
        if token_id == 2227:
            return tokens[1] if len(tokens) > 1 else tokens[0]

        # Akış (aka/akı) Exception (2199) - Default to "akı" (variant 1)
        # "aka" is only used in specific contexts like "akacak"
        if token_id == 2199:
            if i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                # Use "aka" only when followed by vowel-starting suffixes like "acak"
                if next_str.startswith("a") or next_str.startswith("e"):
                    return tokens[0]  # "aka" for "akacak"
            # Default to "akı" for all other cases
            return tokens[1] if len(tokens) > 1 else tokens[0]

        # Ata/Atı Exception (2212) - for "atılırsa", "atılmak", "atıyorlar" etc.
        # Use "atı" (variant 1) when followed by 'l' (passive) or 'y' (yor, yacak)
        if token_id == 2212:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                if next_str.strip().startswith("l") or next_str.strip().startswith("y"):
                    # "atı" + "lırsa" = "atılırsa", "atı" + "yorlar" = "atıyorlar"
                    return tokens[1]
            return tokens[0]  # "ata" by default

        # Special case: yaşı/yaşa - return yaşı before 'na' suffix
        if token_id == 2209:
            if i < len(ids) - 1:
                # 20188 = 'na'
                if ids[i + 1] == 20188:
                    return tokens[1] if len(tokens) > 1 else tokens[0]
            return tokens[0]

        # Alın (alın/aln) Exception (182) - Default to "alın" (variant 0)
        # Only use "aln" when followed by possessive vowel suffix
        if token_id == 182:
            if i < len(ids) - 1:
                next_id = ids[i + 1]
                # Only drop the vowel for simple possessive suffixes
                # 20034 = 'ı', 20033 = 'i', 20035 = 'u', 20036 = 'ü'
                if next_id in (20034, 20033, 20035, 20036):
                    return tokens[1] if len(tokens) > 1 else tokens[0]  # "aln" + ı
            # Keep "alın" for all other cases
            return tokens[0]

        # Ilim/Ilm Exception (166) - Default to "ilim" (variant 0)
        # Only use "ilm" when followed by single-vowel possessive suffix
        if token_id == 166:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_id = ids[i + 1]
                # Only use "ilm" for the possessive/buffer case (ilmi, ilme)
                if next_id in (20033, 20038):  # 'i', 'e'
                    return tokens[1]  # "ilm" + i = "ilmi"
            return tokens[0]  # Default to "ilim"
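
        # For orientation (illustrative, inferred from the comments in this
        # method, not from the shipped vocab): reverse_dict maps an id to its
        # surface variants, e.g. reverse_dict[336] -> ["kap", "kab"] and
        # reverse_dict[107] -> ["tutuk", "tutuğ", "tutk"]; the exceptions in
        # this method pick an index into such variant lists.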

        # Boya/Boyu Exception (2220) - "boya" (paint) vs "boyu" (height)
        # Use "boyu" (variant 1) by default
        # Use "boya" only for paint-related suffix patterns (boyanan, boyamak, boyalı, etc.)
        if token_id == 2220:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_id = ids[i + 1]
                next_str = self.reverse_dict[next_id][0]
                # Use "boya" only when followed by actual suffix tokens starting
                # with 'n', 'm', 'l', 'd' (boyanan, boyamak, boyalı, boyadan) -
                # these are paint-related contexts
                if (
                    next_id >= 20000
                    and next_str.strip()
                    and next_str.strip()[0] in "nmld"
                ):
                    return tokens[0]  # "boya"
            return tokens[1] if len(tokens) > 1 else tokens[0]  # "boyu" by default

        # Bile/Bili Exception (2307) - for "bilir", "biliyor" vs "biler", "bileyor"
        # Use "bili" (variant 1) when followed by 'r' or 'yor'
        if token_id == 2307:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                if next_str.strip().startswith("r") or next_str.strip() == "yor":
                    # "bili" + "r" = "bilir", "bili" + "yor" = "biliyor"
                    return tokens[1]
            return tokens[0]  # Default to "bile"

        # Ada/Adı Exception (2218) - Default to "adı" (variant 1)
        if token_id == 2218:
            if i < len(ids) - 1:
                next_id = ids[i + 1]
                next_str = self.reverse_dict[next_id][0]
                # Use "ada" when followed by 'n' suffixes or 'yı' (for the adayı
                # pattern) or 'ma' (adama)
                # 20017 = suffix yı, 32725 = BPE yı, 20002 = ma/me
                if (
                    next_id == 20040
                    or next_str.startswith("n")
                    or next_id in (20017, 32725, 20002, 32763)
                ):
                    return tokens[0]  # "ada" for "adanın", "adayı", "adama"
            # Default to "adı" for most cases
            return tokens[1] if len(tokens) > 1 else tokens[0]

        # Kap/Kab Exception (336) - favor "kapı" (door) over "kab" (container) context
        # "kapımızı" (our door) tokenizes as kap + ımız + ı -> the default would
        # soften it to kabımızı, so we prevent softening for 336 in potential
        # door contexts
        if token_id == 336:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                # If followed by a vowel (which triggers softening by default),
                # this may be a possessive plural:
                # kap + ımız -> kapımız (door) vs kabımız (container)
                # We prioritize "door" (kap) as it's more common
                if self._starts_with_vowel(next_str):
                    return tokens[0]  # Keep "kap"
            return tokens[0]  # Default "kap"

        # Emekli/Emekle Exception (2295) - Default to "emekli" (variant 1)
        if token_id == 2295:
            if i < len(ids) - 1:
                next_id = ids[i + 1]
                # 20041 = 'yor' - for "emekliyor" use the base form
                if next_id == 20041:
                    return tokens[0]  # "emekle" + yor = emekliyor
            # Default to "emekli"
            return tokens[1] if len(tokens) > 1 else tokens[0]

        # Tutuk/Tutuğ/Tutk Exception (107) - for "tutkun" (fan/devotee)
        # Use "tutk" (variant 2) when followed by a suffix starting with 'u'
        # (for un/unlar etc.); otherwise use default "tutuk" (don't soften to tutuğ)
        if token_id == 107:
            if len(tokens) > 2 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                # Check if the next token starts with 'u' (un, unlar, etc.)
                if next_str.strip().startswith("u"):
                    return tokens[2]  # "tutk" + "un" = "tutkun"
            # For other cases, use the default form (tutuk), not softened (tutuğ)
            return tokens[0]

        # Başla/Başlı Exception (2206) - for "başlıca" (primary/mainly)
        # Use "başlı" (variant 1) when followed by 'ca/ce' suffix
        if token_id == 2206:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_id = ids[i + 1]
                # 20005 = 'ça/çe' suffix, 20047 = 'ce', 20207 = BPE 'ca'
                if next_id in (20005, 20047, 20207):
                    return tokens[1]  # "başlı" + "ca" = "başlıca"
            # Otherwise fall through to the generic logic below

        # Dip/Dib Exception (2406) - soften to "dib" before vowel suffixes
        # (dibinde, dibini, etc.)
        # Token 2406 is outside the 100-2080 softening range, so it is handled here
        if token_id == 2406:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                if self._starts_with_vowel(next_str.strip()):
                    return tokens[1]  # "dib" + "inde" = "dibinde"
            return tokens[0]  # "dip" by default

        if token_id in [19531, 19968]:  # de, ye
            # Special handling for de/ye narrowing:
            # de -> di, ye -> yi when followed by yor, or by variable suffixes
            # starting with a vowel (which get the 'y' buffer)
            should_narrow = False
            if next_token.strip() == "yor":
                should_narrow = True
            elif ids[i + 1] in self.reverse_dict:
                # Check if the next suffix starts with a vowel, invoking the
                # 'y' buffer, e.g. acak/ecek -> yacak/yecek
                suff_forms = self.reverse_dict[ids[i + 1]]
                if suff_forms and any(
                    s.startswith(("a", "e", "ı", "i", "u", "ü", "o", "ö"))
                    for s in suff_forms
                ):
                    should_narrow = True
            if should_narrow:
                # Replace the last char e -> i (handle space prefix)
                original = tokens[0]
                if original.endswith("e"):
                    return original[:-1] + "i"
                elif original.endswith("E"):
                    return original[:-1] + "İ"
            return tokens[0]

        if 100 <= token_id < 2080:
            # Softening range (roots in NO_SOFTENING_ROOTS were already handled above)
            if self._starts_with_vowel(next_token):
                return tokens[1]
            elif token_id <= 110 and next_token.strip() == "ı":
                return tokens[2]
            else:
                return tokens[0]
        elif 2080 <= token_id < 2315:
            if next_token.strip() == "yor":
                return tokens[1]
            else:
                return tokens[0]
        return tokens[0]

    def decode(self, ids: List[int]) -> str:
        """Decode a list of token IDs to text."""
        if not ids:
            return ""

        text_parts = []
        i = 0
        while i < len(ids):
            token_id = ids[i]

            # Handle special tokens
            if token_id == 0 and i < len(ids) - 1:  # uppercase
                next_token = self._select_correct_root(i + 1, ids)
                if next_token.startswith(" "):
                    text_parts.append(" " + self._tr_capitalize(next_token.lstrip()))
                else:
                    text_parts.append(self._tr_capitalize(next_token))
                i += 2
                continue
            elif token_id == 1:  # unknown
                text_parts.append("▁u▁")
            elif token_id in self.reverse_dict:
                tokens = self.reverse_dict[token_id]
                if len(tokens) > 1:
                    if token_id < 20000:  # root token
                        text_parts.append(self._select_correct_root(i, ids))
                    else:  # suffix token
                        # Find context from previous tokens. We need enough
                        # context for both vowel harmony (looking back past
                        # consonants) and consonant harmony (the immediately
                        # preceding char)
                        prev_token = ""
                        j = len(text_parts) - 1
                        tokens_found = 0
                        # Look back up to 3 tokens or until we have enough context
                        temp_context = []
                        while j >= 0 and tokens_found < 3:
                            part = text_parts[j]
                            temp_context.insert(0, part)
                            if any(c.isalpha() for c in part):
                                tokens_found += 1
                            j -= 1
                        if temp_context:
                            prev_token = "".join(temp_context)
                        text_parts.append(
                            self._select_correct_suffix(i, ids, prev_token)
                        )
                else:
                    text_parts.append(tokens[0])
            else:
                text_parts.append("▁")
            i += 1

        return "".join(text_parts)


class TokenType(Enum):
    ROOT = "ROOT"
    SUFFIX = "SUFFIX"
    BPE = "BPE"
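

# Token-id layout assumed throughout this module (inferred from the range
# checks in the code above, not from a published spec):
#   0          uppercase marker
#   1          unknown marker
#   < 20000    roots (100-2079 soften before vowels; 2080-2314 narrow before "yor")
#   >= 20000   suffixes (e.g. 20023 la/le/yla/yle, 20029 lık/lik/luk/lük),
#              with BPE fallback ids in the 32xxx range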


class TurkishTokenizer:
    def __init__(self, kokler_file=None, ekler_file=None, bpe_file=None):
        # Get the directory where this module is located
        package_dir = os.path.dirname(os.path.abspath(__file__))

        # Use provided files or fall back to package resource paths
        if kokler_file is None:
            kokler_file = os.path.join(package_dir, "vocabs/kokler.json")
        if ekler_file is None:
            ekler_file = os.path.join(package_dir, "vocabs/ekler.json")
        if bpe_file is None:
            bpe_file = os.path.join(package_dir, "vocabs/bpe_tokenler.json")

        # Load JSON files
        with open(kokler_file, "r", encoding="utf-8") as f:
            roots = json.load(f)
        with open(ekler_file, "r", encoding="utf-8") as f:
            suffixes = json.load(f)
        with open(bpe_file, "r", encoding="utf-8") as f:
            bpe_tokens = json.load(f)

        # Store the dictionaries as instance attributes
        self.roots = roots
        self.suffixes = suffixes
        self.bpe_tokens = bpe_tokens

        # Now create the vocab and the reverse dict
        self.vocab = self.get_vocab()
        self.reverse_dict = {}

        # Helper to populate the reverse dict
        def add_to_reverse(source_dict):
            for key, value in source_dict.items():
                if value not in self.reverse_dict:
                    self.reverse_dict[value] = []
                # Avoid duplicates
                if key not in self.reverse_dict[value]:
                    self.reverse_dict[value].append(key)

        add_to_reverse(self.roots)
        add_to_reverse(self.suffixes)
        add_to_reverse(self.bpe_tokens)

        self.decoder = TurkishDecoder(self.reverse_dict)
        self.vocab_size = len(self.reverse_dict)

        self.max_root_len = max(len(k) for k in roots) if roots else 0
        self.max_suffix_len = max(len(k) for k in suffixes) if suffixes else 0
        self.max_bpe_len = max(len(k) for k in bpe_tokens) if bpe_tokens else 0

        # Filter BPE tokens: remove any that also exist in suffixes
        # This guarantees suffix priority (matching the Rust behavior)
        self.bpe_tokens_filtered = {
            k: v for k, v in bpe_tokens.items() if k not in suffixes
        }

        # NOTE: the angle-bracketed special-token strings below are assumed
        # (the original names were lost); adjust them to the actual keys in
        # kokler.json.
        self.uppercase_marker = {
            "token": "<uppercase>",
            "id": roots["<uppercase>"],
            "type": TokenType.ROOT,
        }
        self.unknown_marker = {
            "token": "<unknown>",
            "id": roots["<unknown>"],
            "type": TokenType.ROOT,
        }
        self.space_marker = {"token": " ", "id": roots[" "], "type": TokenType.ROOT}

        # added to be compatible with SFTTrainer
        self.pad_token = "<pad>"
        self.eos_token = "<eos>"
        self.pad_token_id = roots[self.pad_token]
        self.eos_token_id = roots[self.eos_token]

    # added to be compatible with SFTTrainer
    def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
        return [self.vocab[token] for token in tokens]

    # added to be compatible with SFTTrainer
    def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
        # CORRECTED: Return the first variant string, not the list of variants
        return [self.reverse_dict[id][0] for id in ids]

    def get_vocab(self) -> Dict[str, int]:
        # Order matters: suffixes AFTER bpe_tokens so suffixes win on conflicts
        return {**self.roots, **self.bpe_tokens, **self.suffixes}
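
    # The scorer in _tokenize_word below can be traced by hand (the word and
    # the vocab entries here are illustrative, not guaranteed to be in the
    # shipped vocab): for "kitaplar", the root candidate "kitap" scores
    # len("kitap") == 5 plus a follow-up-suffix bonus len("lar") == 3, total 8,
    # beating shorter candidates, so the word splits as kitap + lar. A candidate
    # that consumes the whole remaining substring gets a +5 full-match bonus
    # instead, which is what keeps atomic roots like "kapı" ahead of kap + ı.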

    def _tokenize_word(self, word: str) -> Tuple[List[dict], List[int]]:
        uppercase_indices = [i for i, c in enumerate(word) if c.isupper()]
        result = []
        segments = self._camel_split_with_positions(word)

        for seg, orig_pos in segments:
            if orig_pos < len(word) and word[orig_pos].isupper():
                result.append(self.uppercase_marker)

            # Only prepend a space if at start and not whitespace
            should_add_space = orig_pos == 0 and not seg.isspace()
            if should_add_space:
                seg = " " + seg

            s = self._tr_lower(seg)
            pos = 0
            while pos < len(s):
                substr = s[pos:]
                r_matches = self._all_prefix_matches(
                    substr, self.roots, self.max_root_len
                )
                b_matches = self._all_prefix_matches(
                    substr, self.bpe_tokens_filtered, self.max_bpe_len
                )
                s_matches = self._all_prefix_matches(
                    substr, self.suffixes, self.max_suffix_len
                )

                candidates = []
                for r_id, r_tok in r_matches:
                    candidates.append(("ROOT", r_tok, r_id, len(r_tok), TokenType.ROOT))
                for b_id, b_tok in b_matches:
                    candidates.append(("BPE", b_tok, b_id, len(b_tok), TokenType.BPE))
                for s_id, s_tok in s_matches:
                    candidates.append(
                        ("SUFFIX", s_tok, s_id, len(s_tok), TokenType.SUFFIX)
                    )

                if not candidates:
                    result.append(self.unknown_marker)
                    pos += 1
                    continue

                best_candidate = None
                best_score = -1
                for c_type, c_tok, c_id, c_len, c_enum in candidates:
                    score = c_len
                    remainder = substr[c_len:]
                    if not remainder:
                        # Full match bonus
                        score += 5
                    else:
                        # Follow-up suffix bonus
                        s_next_id, s_next_tok = self._longest_prefix_lookup(
                            remainder, self.suffixes, self.max_suffix_len
                        )
                        if s_next_id is not None:
                            # Ignore 1-char variants to prefer atomic roots
                            # (e.g. Kapı vs Kap+ı)
                            if len(s_next_tok) > 1:
                                score += len(s_next_tok)

                    if score > best_score:
                        best_score = score
                        best_candidate = (c_tok, c_id, c_enum)
                    elif score == best_score:
                        # Tie-break priority: Root > Suffix > BPE
                        if (
                            c_enum == TokenType.ROOT
                            and best_candidate[2] != TokenType.ROOT
                        ):
                            best_candidate = (c_tok, c_id, c_enum)
                        elif (
                            c_enum == TokenType.SUFFIX
                            and best_candidate[2] == TokenType.BPE
                        ):
                            best_candidate = (c_tok, c_id, c_enum)

                if best_candidate is not None:
                    result.append(
                        {
                            "token": best_candidate[0],
                            "id": best_candidate[1],
                            "type": best_candidate[2],
                        }
                    )
                    pos += len(best_candidate[0])
                    continue

                result.append(self.unknown_marker)
                pos += 1

        return result, uppercase_indices

    def tokenize_text(self, text: str) -> Tuple[List[dict], List[int]]:
        final_tokens = []
        uppercase_indices = [i for i, c in enumerate(text) if c.isupper()]
        parts = text.split(" ")

        for idx, part in enumerate(parts):
            part = part.strip()
            # part = " " + part
            if part.strip():
                tokens, _ = self._tokenize_word(part)
                cleaned_tokens = []
                for i, token in enumerate(tokens):
                    if (
                        i >= 2
                        and not (0 <= token["id"] <= 19999)
                        and tokens[i - 2] == self.uppercase_marker
                        and tokens[i - 1] == self.space_marker
                    ):
                        cleaned_tokens.pop(-1)
                    # If this token is the uppercase marker, check the previous token
                    if (
                        token == self.uppercase_marker
                        and len(cleaned_tokens) > 0
                        and cleaned_tokens[-1] == self.space_marker
                    ):
                        should_pop = False
                        if i + 1 < len(tokens):
                            next_tok_str = tokens[i + 1]["token"]
                            if next_tok_str.startswith(" "):
                                should_pop = True
                        if should_pop:
                            # Remove the trailing " " before the uppercase marker
                            cleaned_tokens.pop()
                    cleaned_tokens.append(token)
                final_tokens.extend(cleaned_tokens)

        return final_tokens, uppercase_indices

    def encode(self, text: str) -> List[int]:
        tokens, _ = self.tokenize_text(text)
        return [t["id"] for t in tokens]

    def tokenize(self, text: str) -> List[str]:
        tokens, _ = self.tokenize_text(text)
        return [t["token"] for t in tokens]

    def _longest_prefix_lookup(
        self, s: str, table: Dict[str, int], max_len: int = None
    ) -> Tuple[Optional[int], str]:
        end = min(len(s), max_len) if max_len else len(s)
        for i in range(end, 0, -1):
            cand = s[:i]
            if cand in table:
                return table[cand], cand
        return None, ""

    def _all_prefix_matches(
        self, s: str, table: Dict[str, int], max_len: int = None
    ) -> List[Tuple[int, str]]:
        matches = []
        end = min(len(s), max_len) if max_len else len(s)
        for i in range(end, 0, -1):
            prefix = s[:i]
            if prefix in table:
                matches.append((table[prefix], prefix))
        return matches

    def _tr_lower(self, word: str) -> str:
        if "I" in word or "İ" in word:
            word = word.replace("İ", "i").replace("I", "ı")
        return word.lower()

    def _camel_split_with_positions(self, word: str) -> List[Tuple[str, int]]:
        if not word:
            return []
        parts = []
        start = 0
        for i in range(1, len(word)):
            if word[i].isupper():
                if start < i:
                    parts.append((self._tr_lower(word[start:i]), start))
                start = i
        if start < len(word):
            parts.append((self._tr_lower(word[start:]), start))
        return parts if parts else [(self._tr_lower(word), 0)]

    def decode(self, ids: List[int]) -> str:
        return self.decoder.decode(ids)

    # added to be compatible with SFTTrainer
    def __call__(self, text: str) -> Dict[str, List[int]]:
        input_ids = self.encode(text)
        attention_mask = [1 for _ in input_ids]
        return {"input_ids": input_ids, "attention_mask": attention_mask}


class TurkishMFTTokenizerHF(PreTrainedTokenizer):
    """
    Hugging Face Transformers compatible wrapper.
    The main tokenization logic is in TurkishTokenizer.
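
    Example usage (a sketch; assumes the packaged vocab files are available so
    the wrapper can be built with no arguments):

        tok = TurkishMFTTokenizerHF()
        enc = tok("Merhaba dünya")
        enc["input_ids"]              # token ids
        tok.decode(enc["input_ids"])  # morphology-aware detokenization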
""" model_input_names = ["input_ids", "attention_mask"] vocab_files_names = { "kokler_file": "vocabs/kokler.json", "ekler_file": "vocabs/ekler.json", "bpe_file": "vocabs/bpe_tokenler.json", } def __init__( self, kokler_file=None, ekler_file=None, bpe_file=None, **kwargs, ): # Initialize the actual tokenizer self._tok = TurkishTokenizer( kokler_file=kokler_file, ekler_file=ekler_file, bpe_file=bpe_file ) # HF sometimes requires vocab_size self._vocab = self._tok.get_vocab() # IMPORTANT: PreTrainedTokenizer manages special tokens # Your vocab already has , . super().__init__( pad_token=kwargs.pop("pad_token", ""), eos_token=kwargs.pop("eos_token", ""), unk_token=kwargs.pop("unk_token", ""), **kwargs, ) # Ids self.pad_token_id = self._tok.pad_token_id self.eos_token_id = self._tok.eos_token_id self.unk_token_id = self._tok.roots.get("", 1) @property def vocab_size(self) -> int: return len(self._tok.reverse_dict) def get_vocab(self) -> Dict[str, int]: return dict(self._vocab) # HF's base function to call def _tokenize(self, text: str) -> List[str]: # You can modify behavior here: normalize, replace, etc. # e.g.: text = text.replace("naber", "ne haber") return self._tok.tokenize(text) def _convert_token_to_id(self, token: str) -> int: return self._vocab.get(token, self.unk_token_id) def _convert_id_to_token(self, index: int) -> str: # reverse_dict: id -> [variants] if index in self._tok.reverse_dict: return self._tok.reverse_dict[index][0] return self.unk_token def convert_tokens_to_string(self, tokens: List[str]) -> str: # Your decode uses ids; let's not just join simply. # HF uses this function sometimes. You can do tokens->ids->decode. ids = [self._convert_token_to_id(t) for t in tokens] return self._tok.decode(ids) def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: # You can add BOS/EOS here if you want. if token_ids_1 is None: return token_ids_0 return token_ids_0 + token_ids_1 def save_vocabulary( self, save_directory: str, filename_prefix: Optional[str] = None ) -> Tuple[str, ...]: # Create vocabs subdirectory vocabs_dir = os.path.join(save_directory, "vocabs") os.makedirs(vocabs_dir, exist_ok=True) prefix = filename_prefix or "" # Save all three vocabulary files kokler_path = os.path.join(vocabs_dir, f"{prefix}kokler.json") ekler_path = os.path.join(vocabs_dir, f"{prefix}ekler.json") bpe_path = os.path.join(vocabs_dir, f"{prefix}bpe_tokenler.json") with open(kokler_path, "w", encoding="utf-8") as f: json.dump(self._tok.roots, f, ensure_ascii=False, indent=2) with open(ekler_path, "w", encoding="utf-8") as f: json.dump(self._tok.suffixes, f, ensure_ascii=False, indent=2) with open(bpe_path, "w", encoding="utf-8") as f: json.dump(self._tok.bpe_tokens, f, ensure_ascii=False, indent=2) return (kokler_path, ekler_path, bpe_path) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): # default behavior is sufficient return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)