# tokenization_turkish_mft.py
# Turkish morphological (MFT) tokenizer, from the "mft-random" model repository
# (commit 19be1a8, "Add new SentenceTransformer model", author: alibayram).
import json
import os
from enum import Enum
from typing import Dict, List, Optional, Tuple, Union

from transformers import PreTrainedTokenizer

__version__ = "1.0.3"  # Fixed vocab merge order (suffixes must win over BPE)
class TurkishDecoder:
    """Rule-based decoder that reassembles Turkish text from token ids.

    ``reverse_dict`` maps each token id to a list of surface-form variants.
    The decoder picks the variant demanded by Turkish vowel harmony,
    consonant hardening/softening, plus a set of hard-coded per-id
    exceptions for ambiguous roots.

    NOTE(review): the numeric id thresholds used throughout (20013, 20023,
    the 100-2080 softening range, ...) encode the layout of the vocab JSON
    files — verify them whenever the vocabularies change.
    """

    # Define vowel sets as class constants for better performance
    ALL_VOWELS = "aeıioöuüâ"
    INCE_VOWELS = "eiöü"  # Front vowels (ince ünlüler)
    AI_VOWELS = "aıâ"  # Back unrounded
    EI_VOWELS = "ei"  # Front unrounded
    OU_VOWELS = "ou"  # Back rounded
    HARD_CONSONANTS = "fstkçşhp"  # Hard consonants (sert ünsüzler)
    WHITESPACE = " \n\t"

    def __init__(self, reverse_dict):
        """Store the id -> [surface variants] mapping used for decoding."""
        self.reverse_dict = reverse_dict

    def _tr_capitalize(self, word: str) -> str:
        """Capitalize using Turkish casing rules (i -> İ, ı -> I)."""
        if not word:
            return ""
        # str.capitalize() would map leading "i" to "I"; Turkish needs "İ".
        if word.startswith("i"):
            return "İ" + word[1:]
        return word.capitalize()

    def _starts_with_vowel(self, word: str) -> bool:
        """Check if word starts with a vowel."""
        return bool(word and word[0] in self.ALL_VOWELS)

    def _ends_with_vowel(self, word: str) -> bool:
        """Check if word ends with a vowel."""
        return bool(word and word[-1] in self.ALL_VOWELS)

    def _ends_with_any(self, word: str, charset: str) -> bool:
        """True if a charset char appears at/after the word's last vowel.

        Scans backwards from the end; the search stops at the first vowel
        encountered, so only the final syllable's nucleus/coda can match.
        """
        i = len(word) - 1
        while i >= 0:
            if word[i] in charset:
                return True
            if word[i] in self.ALL_VOWELS:
                return False
            i -= 1
        return False

    def _ends_with_ince(self, word: str) -> bool:
        """Check if word ends with front vowels (ince ünlü)."""
        # Loanwords that take front-vowel suffixes despite back final vowels.
        if word in ("saat", "kilovatsaat", "ziraat", "itaat", "istikbal"):
            return True
        # Scan back to the last vowel.
        return self._ends_with_any(word, self.INCE_VOWELS)

    def _ends_with_sert_unsuz(self, word: str) -> bool:
        """Check if word ends with a hard consonant."""
        return bool(word and word[-1] in self.HARD_CONSONANTS)

    def _get_vowel_suffix_index(self, prev_token: str) -> int:
        """Get suffix index based on vowel harmony rules.

        Returns 0 for a/ı/â, 1 for e/i, 2 for o/u, 3 otherwise (ö/ü or
        no vowel found in the context).
        """
        if self._ends_with_any(prev_token, self.AI_VOWELS):
            return 0
        elif self._ends_with_any(prev_token, self.EI_VOWELS):
            return 1
        elif self._ends_with_any(prev_token, self.OU_VOWELS):
            return 2
        return 3

    def _select_correct_suffix(self, i: int, ids: List[int], prev_token: str) -> str:
        """Select the correct suffix based on morphological rules.

        ``prev_token`` is the already-decoded text preceding this suffix;
        it supplies the vowel/consonant context for harmony decisions.
        """
        suffixes = self.reverse_dict[ids[i]]
        token_id = ids[i]
        # Handle different suffix types with cleaner logic
        if token_id < 20013:
            # Two-variant suffixes: pick front/back form by vowel harmony
            return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]
        elif token_id < 20023:  # nın, nin, nun, nün
            return suffixes[self._get_vowel_suffix_index(prev_token)]
        elif token_id == 20023:  # la, le, yla, yle
            # The y-buffer forms only apply when the suffix ends the word.
            end_of_word = True
            if i < len(ids) - 1:
                next_token = self.reverse_dict[ids[i + 1]][0]
                if next_token not in self.WHITESPACE:
                    end_of_word = False
            return self._handle_la_le_suffix(prev_token, suffixes, end_of_word)
        elif token_id <= 20025:  # da, de, ta, te, dan, den, tan, ten
            return self._handle_da_de_suffix(prev_token, suffixes)
        elif 20025 < token_id < 20029:  # dı, di, du, dü, tı, ti, tu, tü, etc.
            return self._handle_di_du_suffix(prev_token, suffixes)
        elif token_id == 20029:  # lık, lik, luk, lük, etc.
            return self._handle_lik_suffix(i, ids, prev_token, suffixes)
        elif token_id == 20030:  # cık, cik, cuk, cük, etc.
            return self._handle_cik_suffix(i, ids, prev_token, suffixes)
        elif token_id == 20031:  # mak, mek, may, mey
            return self._handle_mak_suffix(i, ids, prev_token, suffixes)
        elif token_id == 20032:  # acak, ecek, etc.
            return self._handle_acak_suffix(i, ids, prev_token, suffixes)
        return suffixes[0]

    def _handle_la_le_suffix(
        self, prev_token: str, suffixes: List[str], end_of_word: bool
    ) -> str:
        """Handle la/le/yla/yle suffix selection.

        Variant layout assumed: [la, le, yla, yle] — y-buffer forms (2, 3)
        are used after a vowel at word end.
        """
        if self._ends_with_vowel(prev_token) and end_of_word:
            return suffixes[3] if self._ends_with_ince(prev_token) else suffixes[2]
        else:
            return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]

    def _handle_da_de_suffix(self, prev_token: str, suffixes: List[str]) -> str:
        """Handle da/de/ta/te suffix selection.

        Hard final consonant selects the hardened t-forms (indices 2, 3).
        """
        if self._ends_with_sert_unsuz(prev_token):
            return suffixes[3] if self._ends_with_ince(prev_token) else suffixes[2]
        return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]

    def _handle_di_du_suffix(self, prev_token: str, suffixes: List[str]) -> str:
        """Handle dı/di/du/dü suffix selection.

        Indices 0-3 are the d-forms by vowel class; +4 shifts to the
        hardened t-forms after a hard consonant.
        """
        base_index = self._get_vowel_suffix_index(prev_token)
        return (
            suffixes[base_index + 4]
            if self._ends_with_sert_unsuz(prev_token)
            else suffixes[base_index]
        )

    def _handle_lik_suffix(
        self, i: int, ids: List[int], prev_token: str, suffixes: List[str]
    ) -> str:
        """Handle lık/lik/luk/lük suffix selection.

        A following vowel-initial token softens k -> ğ (offset +4).
        """
        if i >= len(ids) - 1:
            return suffixes[0]
        next_token = self.reverse_dict[ids[i + 1]][0]
        base_index = self._get_vowel_suffix_index(prev_token)
        return (
            suffixes[base_index + 4]
            if self._starts_with_vowel(next_token)
            else suffixes[base_index]
        )

    def _handle_cik_suffix(
        self, i: int, ids: List[int], prev_token: str, suffixes: List[str]
    ) -> str:
        """Handle cık/cik/cuk/cük suffix selection.

        16 variants: vowel class (0-3) x hard-consonant hardening (+4)
        x following-vowel softening (+8).
        """
        if i >= len(ids) - 1:
            return suffixes[0]
        next_token = self.reverse_dict[ids[i + 1]][0]
        base_index = self._get_vowel_suffix_index(prev_token)
        if self._starts_with_vowel(next_token):
            offset = 12 if self._ends_with_sert_unsuz(prev_token) else 8
        else:
            offset = 4 if self._ends_with_sert_unsuz(prev_token) else 0
        return suffixes[base_index + offset]

    def _handle_mak_suffix(
        self, i: int, ids: List[int], prev_token: str, suffixes: List[str]
    ) -> str:
        """Handle mak/mek/may/mey suffix selection.

        A following vowel-initial token selects the may/mey forms (+2).
        """
        if i >= len(ids) - 1:
            return suffixes[0]
        next_token = self.reverse_dict[ids[i + 1]][0]
        base_index = 1 if self._ends_with_ince(prev_token) else 0
        return (
            suffixes[base_index + 2]
            if self._starts_with_vowel(next_token)
            else suffixes[base_index]
        )

    def _handle_acak_suffix(
        self, i: int, ids: List[int], prev_token: str, suffixes: List[str]
    ) -> str:
        """Handle acak/ecek/yacak/yecek suffix selection.

        Three binary factors select among 8 variants: front/back harmony,
        y-buffer after a vowel-final stem, and k -> ğ softening before a
        vowel-initial follow-up token.
        """
        is_vowel_ending = self._ends_with_vowel(prev_token)
        is_ince = self._ends_with_ince(prev_token)
        is_vowel_starting = False
        if i < len(ids) - 1:
            next_token = self.reverse_dict[ids[i + 1]][0]
            is_vowel_starting = self._starts_with_vowel(next_token)
        if is_vowel_starting:
            if is_vowel_ending:
                return suffixes[7] if is_ince else suffixes[6]
            else:
                return suffixes[3] if is_ince else suffixes[2]
        else:
            if is_vowel_ending:
                return suffixes[5] if is_ince else suffixes[4]
            else:
                return suffixes[1] if is_ince else suffixes[0]

    def _select_correct_root(self, i: int, ids: List[int]) -> str:
        """Select the correct root form based on morphological context."""
        token_id = ids[i]
        tokens = self.reverse_dict[token_id]
        # Last token: no following context, fall back to the canonical form.
        if i > len(ids) - 2:
            return tokens[0]
        next_token = self.reverse_dict[ids[i + 1]][0]
        # === EXCEPTIONS: Roots that should NOT soften ===
        # These roots end in consonants that look like they should soften
        # but actually stay unchanged before vowel-initial suffixes
        NO_SOFTENING_ROOTS = {
            204,  # hayat - hayatı (not hayatı -> hayadi)
            220,  # belirt - belirten (not belirden)
            298,  # meslek - mesleki (not mesleği)
        }
        if token_id in NO_SOFTENING_ROOTS:
            return tokens[0]
        # === EXCEPTIONS: Roots where default is variant[1], not variant[0] ===
        # These have multiple forms but the common surface form is the second one
        # NOTE(review): this set is documentation-only; it is never tested for
        # membership — each id is handled by an explicit branch below.
        DEFAULT_VARIANT_1_ROOTS = {
            2227,  # üçlü (not üçle)
            2209,  # yaşı (special handling below)
        }
        # Special case: üçlü - always return üçlü (variant 1) unless specific context
        if token_id == 2227:
            return tokens[1] if len(tokens) > 1 else tokens[0]
        # Akış (aka/akı) Exception (2199) - Default to "akı" (variant 1)
        # "aka" is only used in specific contexts like "akacak"
        if token_id == 2199:
            if i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                # Use "aka" only when followed by vowel-starting suffixes like "acak"
                if next_str.startswith("a") or next_str.startswith("e"):
                    return tokens[0]  # "aka" for "akacak"
            # Default to "akı" for all other cases
            return tokens[1] if len(tokens) > 1 else tokens[0]
        # Ata/Atı Exception (2212) - for "atılırsa", "atılmak", "atıyorlar" etc.
        # Use "atı" (variant 1) when followed by 'l' (passive) or 'y' (yor, yacak)
        if token_id == 2212:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                if next_str.strip().startswith("l") or next_str.strip().startswith("y"):
                    # "atı" + "lırsa" = "atılırsa", "atı" + "yorlar" = "atıyorlar"
                    return tokens[1]
            return tokens[0]  # "ata" by default
        # Special case: yaşı/yaşa - return yaşı before 'na' suffix
        if token_id == 2209:
            if i < len(ids) - 1:
                # 20188 = 'na'
                if ids[i + 1] == 20188:
                    return tokens[1] if len(tokens) > 1 else tokens[0]
            return tokens[0]
        # Alın (alın/aln) Exception (182) - Default to "alın" (variant 0)
        # Only use "aln" when followed by possessive vowel suffix
        if token_id == 182:
            if i < len(ids) - 1:
                next_id = ids[i + 1]
                # Only drop vowel for simple possessive suffixes
                # 20034 = 'ı', 20033 = 'i', 20035 = 'u', 20036 = 'ü'
                if next_id in (20034, 20033, 20035, 20036):
                    return tokens[1] if len(tokens) > 1 else tokens[0]  # "aln" + ı
            # Keep "alın" for all other cases
            return tokens[0]
        # Ilim/Ilm Exception (166) - Default to "ilim" (variant 0)
        # Only use "ilm" when followed by single-vowel possessive suffix
        if token_id == 166:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_id = ids[i + 1]
                # Only use "ilm" for possessive/buffer case (ilmi, ilme)
                if next_id in (20033, 20038):  # 'i', 'e'
                    return tokens[1]  # "ilm" + i = "ilmi"
            return tokens[0]  # Default to "ilim"
        # Boya/Boyu Exception (2220) - "boya" (paint) vs "boyu" (height)
        # Use "boyu" (variant 1) by default
        # Use "boya" only for paint-related suffix patterns (boyanan, boyamak, boyalı, etc.)
        if token_id == 2220:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_id = ids[i + 1]
                next_str = self.reverse_dict[next_id][0]
                # Use "boya" only when followed by actual suffix tokens starting with 'n', 'm', 'l', 'd'
                # (boyanan, boyamak, boyalı, boyadan) - these are paint-related contexts
                if (
                    next_id >= 20000
                    and next_str.strip()
                    and next_str.strip()[0] in "nmld"
                ):
                    return tokens[0]  # "boya"
            return tokens[1] if len(tokens) > 1 else tokens[0]  # "boyu" by default
        # Bile/Bili Exception (2307) - for "bilir", "biliyor" vs "biler", "beleyor"
        # Use "bili" (variant 1) when followed by 'r' or 'yor'
        if token_id == 2307:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                if next_str.strip().startswith("r") or next_str.strip() == "yor":
                    # "bili" + "r" = "bilir", "bili" + "yor" = "biliyor"
                    return tokens[1]
            return tokens[0]  # Default to "bile"
        # Ada/Adı Exception (2218) - Default to "adı" (variant 1)
        if token_id == 2218:
            if i < len(ids) - 1:
                next_id = ids[i + 1]
                next_str = self.reverse_dict[next_id][0]
                # Use "ada" when followed by 'n' suffixes or 'yı' (for adayı pattern) or 'ma' (adama)
                # 20017 = suffix yı, 32725 = BPE yı, 20002 = ma/me
                if (
                    next_id == 20040
                    or next_str.startswith("n")
                    or next_id in (20017, 32725, 20002, 32763)
                ):
                    return tokens[0]  # "ada" for "adanın", "adayı", "adama"
            # Default to "adı" for most cases
            return tokens[1] if len(tokens) > 1 else tokens[0]
        # Kap/Kab Exception (336) - favor "kapı" (door) over "kab" (container) context
        # "kapımızı" (our door) tokenizes as kap + ımız + ı -> default softens to kabımızı
        # So we prevent softening for 336 in potential door contexts
        if token_id == 336:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                # If followed by vowel (which causes softening default), check if it looks like possessive plural
                # kap + ımız -> kapımız (door) vs kabımız (container)
                # We prioritize "door" (kap) as it's more common
                if self._starts_with_vowel(next_str):
                    return tokens[0]  # Keep "kap"
            return tokens[0]  # Default "kap"
        # Emekli/Emekle Exception (2295) - Default to "emekli" (variant 1)
        if token_id == 2295:
            if i < len(ids) - 1:
                next_id = ids[i + 1]
                # 20041 = 'yor' - for "emekliyor" use base form
                if next_id == 20041:
                    return tokens[0]  # "emekle" + yor = emekliyor
            # Default to "emekli"
            return tokens[1] if len(tokens) > 1 else tokens[0]
        # Tutuk/Tutuğ/Tutk Exception (107) - for "tutkun" (fan/devotee)
        # Use "tutk" (variant 2) when followed by suffix starting with 'u' (for un/unlar etc.)
        # Otherwise use default "tutuk" (don't soften to tutuğ)
        if token_id == 107:
            if len(tokens) > 2 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                # Check if next token starts with 'u' (un, unlar, etc.)
                if next_str.strip().startswith("u"):
                    return tokens[2]  # "tutk" + "un" = "tutkun"
            # For other cases, use default form (tutuk), not softened (tutuğ)
            return tokens[0]
        # Başla/Başlı Exception (2206) - for "başlıca" (primary/mainly)
        # Use "başlı" (variant 1) when followed by 'ca/ce' suffix
        if token_id == 2206:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_id = ids[i + 1]
                # 20005 = 'ça/çe' suffix, 20047 = 'ce', 20207 = BPE 'ca'
                if next_id in (20005, 20047, 20207):
                    return tokens[1]  # "başlı" + "ca" = "başlıca"
            # Continue to existing logic below (no early return for 2206)
        # Dip/Dib Exception (2406) - soften to "dib" before vowel suffixes
        # (dibinde, dibini, etc.) Token 2406 is outside the 100-2080 range
        if token_id == 2406:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                if self._starts_with_vowel(next_str.strip()):
                    return tokens[1]  # "dib" + "inde" = "dibinde"
            return tokens[0]  # "dip" by default
        if token_id in [19531, 19968]:  # de, ye
            # Special handling for de/ye narrowing
            # de -> di, ye -> yi when followed by yor or variable suffixes starting with vowel (which get 'y' buffer)
            should_narrow = False
            if next_token.strip() == "yor":
                should_narrow = True
            elif ids[i + 1] in self.reverse_dict:
                # Check if next suffix starts with vowel, invoking 'y' buffer
                # e.g. acak/ecek -> yacak/yecek
                suff_forms = self.reverse_dict[ids[i + 1]]
                if suff_forms and any(
                    s.startswith(("a", "e", "ı", "i", "u", "ü", "o", "ö"))
                    for s in suff_forms
                ):
                    should_narrow = True
            if should_narrow:
                # Replace last char e -> i
                # Handle space prefix
                original = tokens[0]
                if original.endswith("e"):
                    return original[:-1] + "i"
                elif original.endswith("E"):
                    return original[:-1] + "İ"
            return tokens[0]
        if 100 <= token_id < 2080:
            # Consonant-softening range: variant 1 is the softened form.
            # Skip softening for roots in NO_SOFTENING_ROOTS (already handled above)
            if self._starts_with_vowel(next_token):
                return tokens[1]
            elif token_id <= 110 and next_token.strip() == "ı":
                # next_token may carry a leading space, so the vowel check
                # above did not fire; variant 2 covers this 'ı' case.
                return tokens[2]
            else:
                return tokens[0]
        elif 2080 <= token_id < 2315:
            # Vowel-narrowing range: variant 1 is the narrowed form before "yor".
            if next_token.strip() == "yor":
                return tokens[1]
            else:
                return tokens[0]
        return tokens[0]

    def decode(self, ids: List[int]) -> str:
        """Decode a list of token IDs to text."""
        if not ids:
            return ""
        text_parts = []
        i = 0
        while i < len(ids):
            token_id = ids[i]
            # Handle special tokens
            if token_id == 0 and i < len(ids) - 1:  # uppercase marker
                # id 0 means "capitalize the next token"; consume both ids.
                next_token = self._select_correct_root(i + 1, ids)
                if next_token.startswith(" "):
                    text_parts.append(" " + self._tr_capitalize(next_token.lstrip()))
                else:
                    text_parts.append(self._tr_capitalize(next_token))
                i += 2
                continue
            elif token_id == 1:  # unknown
                text_parts.append("▁u▁")
            elif token_id in self.reverse_dict:
                tokens = self.reverse_dict[token_id]
                if len(tokens) > 1:
                    if token_id < 20000:  # root token
                        text_parts.append(self._select_correct_root(i, ids))
                    else:  # suffix token
                        # Find context from previous tokens
                        # We need enough context for both vowel harmony (looking back past consonants)
                        # and consonant harmony (immediate previous char)
                        prev_token = ""
                        j = len(text_parts) - 1
                        tokens_found = 0
                        # Look back up to 3 tokens or until we have enough context
                        temp_context = []
                        while j >= 0 and tokens_found < 3:
                            part = text_parts[j]
                            temp_context.insert(0, part)
                            if any(c.isalpha() for c in part):
                                tokens_found += 1
                            j -= 1
                        if temp_context:
                            prev_token = "".join(temp_context)
                        text_parts.append(
                            self._select_correct_suffix(i, ids, prev_token)
                        )
                else:
                    text_parts.append(tokens[0])
            else:
                # Id not present in the vocabulary: placeholder glyph.
                text_parts.append("▁")
            i += 1
        return "".join(text_parts)
class TokenType(Enum):
    """Which vocabulary a token came from: root, suffix, or BPE fallback."""

    ROOT = "ROOT"
    SUFFIX = "SUFFIX"
    BPE = "BPE"
class TurkishTokenizer:
    """Morphological tokenizer for Turkish backed by three JSON vocabularies.

    Roots (``kokler``), suffixes (``ekler``) and BPE fallback tokens
    (``bpe_tokenler``) are loaded from JSON and merged into a single vocab;
    decoding is delegated to :class:`TurkishDecoder` built over the
    id -> [surface variants] reverse mapping.
    """

    def __init__(self, kokler_file=None, ekler_file=None, bpe_file=None):
        """Load vocabularies; defaults resolve to ``vocabs/`` next to this file."""
        # Get the directory where this module is located
        package_dir = os.path.dirname(os.path.abspath(__file__))
        # Use provided files or fall back to package resource paths
        if kokler_file is None:
            kokler_file = os.path.join(package_dir, "vocabs/kokler.json")
        if ekler_file is None:
            ekler_file = os.path.join(package_dir, "vocabs/ekler.json")
        if bpe_file is None:
            bpe_file = os.path.join(package_dir, "vocabs/bpe_tokenler.json")
        # Load JSON files
        with open(kokler_file, "r", encoding="utf-8") as f:
            roots = json.load(f)
        with open(ekler_file, "r", encoding="utf-8") as f:
            suffixes = json.load(f)
        with open(bpe_file, "r", encoding="utf-8") as f:
            bpe_tokens = json.load(f)
        # Store the dictionaries as instance attributes
        self.roots = roots
        self.suffixes = suffixes
        self.bpe_tokens = bpe_tokens
        # Merged token -> id mapping (suffixes override BPE on key conflicts)
        self.vocab = self.get_vocab()
        # id -> list of surface variants, populated roots first, then
        # suffixes, then BPE, so variant order is deterministic
        self.reverse_dict = {}

        def add_to_reverse(source_dict):
            # Append each surface form under its id, skipping duplicates
            for key, value in source_dict.items():
                if value not in self.reverse_dict:
                    self.reverse_dict[value] = []
                if key not in self.reverse_dict[value]:
                    self.reverse_dict[value].append(key)

        add_to_reverse(self.roots)
        add_to_reverse(self.suffixes)
        add_to_reverse(self.bpe_tokens)
        self.decoder = TurkishDecoder(self.reverse_dict)
        self.vocab_size = len(self.reverse_dict)
        # Longest key in each table bounds the prefix search in _tokenize_word
        self.max_root_len = max(len(k) for k in roots) if roots else 0
        self.max_suffix_len = max(len(k) for k in suffixes) if suffixes else 0
        self.max_bpe_len = max(len(k) for k in bpe_tokens) if bpe_tokens else 0
        # Filter BPE tokens: remove any that also exist in suffixes
        # This guarantees suffix priority (matching Rust behavior)
        self.bpe_tokens_filtered = {
            k: v for k, v in bpe_tokens.items() if k not in suffixes
        }
        self.uppercase_marker = {
            "token": "<uppercase>",
            "id": roots["<uppercase>"],
            "type": TokenType.ROOT,
        }
        self.unknown_marker = {
            "token": "<unknown>",
            "id": roots["<unknown>"],
            "type": TokenType.ROOT,
        }
        self.space_marker = {"token": " ", "id": roots[" "], "type": TokenType.ROOT}
        # added to be compatible with SFTTrainer
        self.pad_token = "<pad>"
        self.eos_token = "<eos>"
        self.pad_token_id = roots[self.pad_token]
        self.eos_token_id = roots[self.eos_token]

    # added to be compatible with SFTTrainer
    def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
        """Map token strings to ids via the merged vocab (KeyError if absent)."""
        return [self.vocab[token] for token in tokens]

    # added to be compatible with SFTTrainer
    def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
        """Map ids to their first (canonical) variant string."""
        return [self.reverse_dict[id][0] for id in ids]

    def get_vocab(self) -> Dict[str, int]:
        """Merged token -> id mapping.

        Order matters: suffixes are merged AFTER bpe_tokens so suffixes win
        on key conflicts.
        """
        return {**self.roots, **self.bpe_tokens, **self.suffixes}

    def _tokenize_word(self, word: str) -> Tuple[List[dict], List[int]]:
        """Tokenize one word; return (token dicts, uppercase char indices)."""
        uppercase_indices = [i for i, c in enumerate(word) if c.isupper()]
        result = []
        segments = self._camel_split_with_positions(word)
        for seg, orig_pos in segments:
            # Emit an <uppercase> marker for each capitalized segment
            if orig_pos < len(word) and word[orig_pos].isupper():
                result.append(self.uppercase_marker)
            # Only prepend space if at start and not whitespace
            should_add_space = orig_pos == 0 and not seg.isspace()
            if should_add_space:
                seg = " " + seg
            s = self._tr_lower(seg)
            pos = 0
            while pos < len(s):
                substr = s[pos:]
                # Collect every prefix match from each vocabulary
                r_matches = self._all_prefix_matches(
                    substr, self.roots, self.max_root_len
                )
                b_matches = self._all_prefix_matches(
                    substr, self.bpe_tokens_filtered, self.max_bpe_len
                )
                s_matches = self._all_prefix_matches(
                    substr, self.suffixes, self.max_suffix_len
                )
                candidates = []
                for r_id, r_tok in r_matches:
                    candidates.append(("ROOT", r_tok, r_id, len(r_tok), TokenType.ROOT))
                for b_id, b_tok in b_matches:
                    candidates.append(("BPE", b_tok, b_id, len(b_tok), TokenType.BPE))
                for s_id, s_tok in s_matches:
                    candidates.append(
                        ("SUFFIX", s_tok, s_id, len(s_tok), TokenType.SUFFIX)
                    )
                if not candidates:
                    # No vocabulary covers this character: emit <unknown>
                    result.append(self.unknown_marker)
                    pos += 1
                    continue
                # Score = match length, plus a full-match bonus or a bonus
                # for a suffix that continues cleanly after the match.
                best_candidate = None
                best_score = -1
                for c_type, c_tok, c_id, c_len, c_enum in candidates:
                    score = c_len
                    remainder = substr[c_len:]
                    if not remainder:
                        # Full match bonus
                        score += 5
                    else:
                        # Follow-up suffix bonus
                        s_next_id, s_next_tok = self._longest_prefix_lookup(
                            remainder, self.suffixes, self.max_suffix_len
                        )
                        if s_next_id is not None:
                            # Ignore 1-char variants to prefer atomic roots
                            # (e.g. Kapı vs Kap+ı)
                            if len(s_next_tok) > 1:
                                score += len(s_next_tok)
                    if score > best_score:
                        best_score = score
                        best_candidate = (c_tok, c_id, c_enum)
                    elif score == best_score:
                        # Tie-break Priority: Root > Suffix > BPE
                        if (
                            c_enum == TokenType.ROOT
                            and best_candidate[2] != TokenType.ROOT
                        ):
                            best_candidate = (c_tok, c_id, c_enum)
                        elif (
                            c_enum == TokenType.SUFFIX
                            and best_candidate[2] == TokenType.BPE
                        ):
                            best_candidate = (c_tok, c_id, c_enum)
                result.append(
                    {
                        "token": best_candidate[0],
                        "id": best_candidate[1],
                        "type": best_candidate[2],
                    }
                )
                pos += len(best_candidate[0])
        return result, uppercase_indices

    def tokenize_text(self, text: str) -> Tuple[List[dict], List[int]]:
        """Tokenize space-separated text; return (tokens, uppercase indices)."""
        final_tokens = []
        uppercase_indices = [i for i, c in enumerate(text) if c.isupper()]
        parts = text.split(" ")
        for part in parts:
            part = part.strip()
            if part:
                tokens, _ = self._tokenize_word(part)
                cleaned_tokens = []
                for i, token in enumerate(tokens):
                    # Drop a stray space token sitting between an <uppercase>
                    # marker and a following non-root token (id >= 20000)
                    if (
                        i >= 2
                        and not (0 <= token["id"] <= 19999)
                        and tokens[i - 2] == self.uppercase_marker
                        and tokens[i - 1] == self.space_marker
                    ):
                        cleaned_tokens.pop(-1)
                    # If this token is uppercase_marker, check previous token:
                    # when the capitalized token already carries its own
                    # leading space, the explicit space token is redundant.
                    if (
                        token == self.uppercase_marker
                        and len(cleaned_tokens) > 0
                        and cleaned_tokens[-1] == self.space_marker
                    ):
                        should_pop = False
                        if i + 1 < len(tokens):
                            next_tok_str = tokens[i + 1]["token"]
                            if next_tok_str.startswith(" "):
                                should_pop = True
                        if should_pop:
                            cleaned_tokens.pop()  # remove the " " before uppercase
                    cleaned_tokens.append(token)
                final_tokens.extend(cleaned_tokens)
        return final_tokens, uppercase_indices

    def encode(self, text: str) -> List[int]:
        """Encode text into a list of token ids."""
        tokens, _ = self.tokenize_text(text)
        return [t["id"] for t in tokens]

    def tokenize(self, text: str) -> List[str]:
        """Tokenize text into a list of token strings."""
        tokens, _ = self.tokenize_text(text)
        return [t["token"] for t in tokens]

    def _longest_prefix_lookup(
        self, s: str, table: Dict[str, int], max_len: Optional[int] = None
    ) -> Tuple[Optional[int], str]:
        """Return (id, prefix) for the longest prefix of s in table, else (None, "")."""
        end = min(len(s), max_len) if max_len else len(s)
        for i in range(end, 0, -1):
            cand = s[:i]
            if cand in table:
                return table[cand], cand
        return None, ""

    def _all_prefix_matches(
        self, s: str, table: Dict[str, int], max_len: Optional[int] = None
    ) -> List[Tuple[int, str]]:
        """Return every (id, prefix) of s present in table, longest first."""
        matches = []
        end = min(len(s), max_len) if max_len else len(s)
        for i in range(end, 0, -1):
            prefix = s[:i]
            if prefix in table:
                matches.append((table[prefix], prefix))
        return matches

    def _tr_lower(self, word: str) -> str:
        """Lowercase with Turkish dotted/dotless mapping: İ -> i, I -> ı."""
        if "I" in word or "İ" in word:
            word = word.replace("İ", "i").replace("I", "ı")
        return word.lower()

    def _camel_split_with_positions(self, word: str) -> List[Tuple[str, int]]:
        """Split camelCase; return (lowered segment, start offset) pairs."""
        if not word:
            return []
        parts = []
        start = 0
        for i in range(1, len(word)):
            if word[i].isupper():
                if start < i:
                    parts.append((self._tr_lower(word[start:i]), start))
                start = i
        if start < len(word):
            parts.append((self._tr_lower(word[start:]), start))
        return parts if parts else [(self._tr_lower(word), 0)]

    def decode(self, ids: List[int]) -> str:
        """Decode token ids back to text via the morphological decoder."""
        return self.decoder.decode(ids)

    # added to be compatible with SFTTrainer
    def __call__(self, text: str) -> Dict[str, List[int]]:
        """Encode text and return HF-style input_ids / attention_mask."""
        input_ids = self.encode(text)
        attention_mask = [1 for _ in input_ids]
        return {"input_ids": input_ids, "attention_mask": attention_mask}
class TurkishMFTTokenizerHF(PreTrainedTokenizer):
    """
    Hugging Face Transformers capable wrapper.
    The main tokenization logic is in TurkishTokenizer.
    """

    model_input_names = ["input_ids", "attention_mask"]
    # Relative paths save_vocabulary/from_pretrained use to locate vocab files
    vocab_files_names = {
        "kokler_file": "vocabs/kokler.json",
        "ekler_file": "vocabs/ekler.json",
        "bpe_file": "vocabs/bpe_tokenler.json",
    }

    def __init__(
        self,
        kokler_file=None,
        ekler_file=None,
        bpe_file=None,
        **kwargs,
    ):
        # Initialize the actual tokenizer before super().__init__.
        # NOTE(review): presumably the base __init__ calls back into
        # get_vocab/_convert_* which need _tok — confirm against the
        # installed transformers version.
        self._tok = TurkishTokenizer(
            kokler_file=kokler_file, ekler_file=ekler_file, bpe_file=bpe_file
        )
        # HF sometimes requires vocab_size
        self._vocab = self._tok.get_vocab()
        # IMPORTANT: PreTrainedTokenizer manages special tokens.
        # The vocab already contains <pad>, <eos>.
        super().__init__(
            pad_token=kwargs.pop("pad_token", "<pad>"),
            eos_token=kwargs.pop("eos_token", "<eos>"),
            unk_token=kwargs.pop("unk_token", "<unknown>"),
            **kwargs,
        )
        # Ids
        # NOTE(review): assigning these directly overrides whatever the base
        # class derives from the special-token strings — assumes both agree
        # with the vocab; verify against transformers' property setters.
        self.pad_token_id = self._tok.pad_token_id
        self.eos_token_id = self._tok.eos_token_id
        self.unk_token_id = self._tok.roots.get("<unknown>", 1)

    @property
    def vocab_size(self) -> int:
        # Number of distinct ids (not distinct surface strings)
        return len(self._tok.reverse_dict)

    def get_vocab(self) -> Dict[str, int]:
        # Return a copy so callers cannot mutate the cached mapping
        return dict(self._vocab)

    # HF's base tokenization hook
    def _tokenize(self, text: str) -> List[str]:
        # Normalization/replacement hooks could be added here
        # e.g.: text = text.replace("naber", "ne haber")
        return self._tok.tokenize(text)

    def _convert_token_to_id(self, token: str) -> int:
        return self._vocab.get(token, self.unk_token_id)

    def _convert_id_to_token(self, index: int) -> str:
        # reverse_dict: id -> [variants]; first variant is canonical
        if index in self._tok.reverse_dict:
            return self._tok.reverse_dict[index][0]
        return self.unk_token

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        # The morphological decode operates on ids, so round-trip
        # tokens -> ids -> decode instead of naively joining strings.
        ids = [self._convert_token_to_id(t) for t in tokens]
        return self._tok.decode(ids)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        # No BOS/EOS added; sequences are concatenated as-is
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str, ...]:
        """Write the three vocab JSON files under ``save_directory/vocabs``."""
        # Create vocabs subdirectory
        vocabs_dir = os.path.join(save_directory, "vocabs")
        os.makedirs(vocabs_dir, exist_ok=True)
        prefix = filename_prefix or ""
        # Save all three vocabulary files
        kokler_path = os.path.join(vocabs_dir, f"{prefix}kokler.json")
        ekler_path = os.path.join(vocabs_dir, f"{prefix}ekler.json")
        bpe_path = os.path.join(vocabs_dir, f"{prefix}bpe_tokenler.json")
        with open(kokler_path, "w", encoding="utf-8") as f:
            json.dump(self._tok.roots, f, ensure_ascii=False, indent=2)
        with open(ekler_path, "w", encoding="utf-8") as f:
            json.dump(self._tok.suffixes, f, ensure_ascii=False, indent=2)
        with open(bpe_path, "w", encoding="utf-8") as f:
            json.dump(self._tok.bpe_tokens, f, ensure_ascii=False, indent=2)
        return (kokler_path, ekler_path, bpe_path)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        # Default behavior is sufficient; override kept for discoverability
        return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)