import json
import os
from enum import Enum
from typing import Dict, List, Optional, Tuple, Union

from transformers import PreTrainedTokenizer

__version__ = "1.0.3"  # Fixed vocab merge order


class TurkishDecoder:
    # Define vowel sets as class constants for better performance
    ALL_VOWELS = "aeıioöuüâ"
    INCE_VOWELS = "eiöü"  # Front vowels
    AI_VOWELS = "aıâ"  # Back unrounded
    EI_VOWELS = "ei"  # Front unrounded
    OU_VOWELS = "ou"  # Back rounded
    HARD_CONSONANTS = "fstkçşhp"  # Sert ünsüzler
    WHITESPACE = " \n\t"

    def __init__(self, reverse_dict):
        self.reverse_dict = reverse_dict

    def _tr_capitalize(self, word: str) -> str:
        """Capitalize using Turkish casing rules (i -> İ, ı -> I)."""
        if not word:
            return ""
        if word.startswith("i"):
            return "İ" + word[1:]
        return word.capitalize()

    def _starts_with_vowel(self, word: str) -> bool:
        """Check if word starts with a vowel."""
        return bool(word and word[0] in self.ALL_VOWELS)

    def _ends_with_vowel(self, word: str) -> bool:
        """Check if word ends with a vowel."""
        return bool(word and word[-1] in self.ALL_VOWELS)

    def _ends_with_any(self, word: str, charset: str) -> bool:
        # Scan backward from the end until the first vowel is reached
        i = len(word) - 1
        while i >= 0:
            if word[i] in charset:
                return True
            if word[i] in self.ALL_VOWELS:
                return False
            i -= 1
        return False

    def _ends_with_ince(self, word: str) -> bool:
        """Check if word ends with front vowels (ince ünlü)."""
        if word in ("saat", "kilovatsaat", "ziraat", "itaat", "istikbal"):
            return True
        # Scan backward until the first vowel
        return self._ends_with_any(word, self.INCE_VOWELS)

    def _ends_with_sert_unsuz(self, word: str) -> bool:
        """Check if word ends with a hard consonant."""
        return bool(word and word[-1] in self.HARD_CONSONANTS)

    def _get_vowel_suffix_index(self, prev_token: str) -> int:
        """Get suffix index based on vowel harmony rules."""
        if self._ends_with_any(prev_token, self.AI_VOWELS):
            return 0
        elif self._ends_with_any(prev_token, self.EI_VOWELS):
            return 1
        elif self._ends_with_any(prev_token, self.OU_VOWELS):
            return 2
        return 3
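
    # Expected behavior of the four-way harmony index above (worked examples,
    # not a test fixture):
    #   _get_vowel_suffix_index("kitap") -> 0  (last vowel 'a', back unrounded)
    #   _get_vowel_suffix_index("ev")    -> 1  (last vowel 'e', front unrounded)
    #   _get_vowel_suffix_index("okul")  -> 2  (last vowel 'u', back rounded)
    #   _get_vowel_suffix_index("köy")   -> 3  (fallback, covers 'ö'/'ü')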

    def _select_correct_suffix(self, i: int, ids: List[int], prev_token: str) -> str:
        """Select the correct suffix based on morphological rules."""
        suffixes = self.reverse_dict[ids[i]]
        token_id = ids[i]

        # Handle different suffix types with cleaner logic
        if token_id < 20013:
            # Basic suffix selection based on vowel harmony
            return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]
        elif token_id < 20023:  # nın, nin, nun, nün
            return suffixes[self._get_vowel_suffix_index(prev_token)]
        elif token_id == 20023:  # la, le, yla, yle
            end_of_word = True
            if i < len(ids) - 1:
                next_token = self.reverse_dict[ids[i + 1]][0]
                if next_token not in self.WHITESPACE:
                    end_of_word = False
            return self._handle_la_le_suffix(prev_token, suffixes, end_of_word)
        elif token_id <= 20025:  # da, de, ta, te, dan, den, tan, ten
            return self._handle_da_de_suffix(prev_token, suffixes)
        elif 20025 < token_id < 20029:  # dı, di, du, dü, tı, ti, tu, tü, etc.
            return self._handle_di_du_suffix(prev_token, suffixes)
        elif token_id == 20029:  # lık, lik, luk, lük, etc.
            return self._handle_lik_suffix(i, ids, prev_token, suffixes)
        elif token_id == 20030:  # cık, cik, cuk, cük, etc.
            return self._handle_cik_suffix(i, ids, prev_token, suffixes)
        elif token_id == 20031:  # mak, mek, may, mey
            return self._handle_mak_suffix(i, ids, prev_token, suffixes)
        elif token_id == 20032:  # acak, ecek, etc.
            return self._handle_acak_suffix(i, ids, prev_token, suffixes)
        return suffixes[0]

    def _handle_la_le_suffix(
        self, prev_token: str, suffixes: List[str], end_of_word: bool
    ) -> str:
        """Handle la/le/yla/yle suffix selection."""
        if self._ends_with_vowel(prev_token) and end_of_word:
            return suffixes[3] if self._ends_with_ince(prev_token) else suffixes[2]
        else:
            return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]

    def _handle_da_de_suffix(self, prev_token: str, suffixes: List[str]) -> str:
        """Handle da/de/ta/te suffix selection."""
        if self._ends_with_sert_unsuz(prev_token):
            return suffixes[3] if self._ends_with_ince(prev_token) else suffixes[2]
        return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]

    def _handle_di_du_suffix(self, prev_token: str, suffixes: List[str]) -> str:
        """Handle dı/di/du/dü suffix selection."""
        base_index = self._get_vowel_suffix_index(prev_token)
        return (
            suffixes[base_index + 4]
            if self._ends_with_sert_unsuz(prev_token)
            else suffixes[base_index]
        )

    def _handle_lik_suffix(
        self, i: int, ids: List[int], prev_token: str, suffixes: List[str]
    ) -> str:
        """Handle lık/lik/luk/lük suffix selection."""
        if i >= len(ids) - 1:
            return suffixes[0]
        next_token = self.reverse_dict[ids[i + 1]][0]
        base_index = self._get_vowel_suffix_index(prev_token)
        return (
            suffixes[base_index + 4]
            if self._starts_with_vowel(next_token)
            else suffixes[base_index]
        )

    def _handle_cik_suffix(
        self, i: int, ids: List[int], prev_token: str, suffixes: List[str]
    ) -> str:
        """Handle cık/cik/cuk/cük suffix selection."""
        if i >= len(ids) - 1:
            return suffixes[0]
        next_token = self.reverse_dict[ids[i + 1]][0]
        base_index = self._get_vowel_suffix_index(prev_token)
        if self._starts_with_vowel(next_token):
            offset = 12 if self._ends_with_sert_unsuz(prev_token) else 8
        else:
            offset = 4 if self._ends_with_sert_unsuz(prev_token) else 0
        return suffixes[base_index + offset]

    def _handle_mak_suffix(
        self, i: int, ids: List[int], prev_token: str, suffixes: List[str]
    ) -> str:
        """Handle mak/mek/may/mey suffix selection."""
        if i >= len(ids) - 1:
            return suffixes[0]
        next_token = self.reverse_dict[ids[i + 1]][0]
        base_index = 1 if self._ends_with_ince(prev_token) else 0
        return (
            suffixes[base_index + 2]
            if self._starts_with_vowel(next_token)
            else suffixes[base_index]
        )

    def _handle_acak_suffix(
        self, i: int, ids: List[int], prev_token: str, suffixes: List[str]
    ) -> str:
        """Handle acak/ecek/yacak/yecek suffix selection."""
        is_vowel_ending = self._ends_with_vowel(prev_token)
        is_ince = self._ends_with_ince(prev_token)
        is_vowel_starting = False
        if i < len(ids) - 1:
            next_token = self.reverse_dict[ids[i + 1]][0]
            is_vowel_starting = self._starts_with_vowel(next_token)

        if is_vowel_starting:
            if is_vowel_ending:
                return suffixes[7] if is_ince else suffixes[6]
            else:
                return suffixes[3] if is_ince else suffixes[2]
        else:
            if is_vowel_ending:
                return suffixes[5] if is_ince else suffixes[4]
            else:
                return suffixes[1] if is_ince else suffixes[0]

    def _select_correct_root(self, i: int, ids: List[int]) -> str:
        """Select the correct root form based on morphological context."""
        token_id = ids[i]
        tokens = self.reverse_dict[token_id]
        if i > len(ids) - 2:
            return tokens[0]
        next_token = self.reverse_dict[ids[i + 1]][0]

        # === EXCEPTIONS: Roots that should NOT soften ===
        # These roots end in consonants that look like they should soften
        # but actually stay unchanged before vowel-initial suffixes
        NO_SOFTENING_ROOTS = {
            204,  # hayat -> hayatı (not hayadı)
            220,  # belirt -> belirten (not belirden)
            298,  # meslek -> mesleki (not mesleği)
        }
        if token_id in NO_SOFTENING_ROOTS:
            return tokens[0]

        # === EXCEPTIONS: Roots where default is variant[1], not variant[0] ===
        # These have multiple forms but the common surface form is the second one
        # (documentation only; each root is handled individually below)
        DEFAULT_VARIANT_1_ROOTS = {
            2227,  # üçlü (not üçle)
            2209,  # yaşı (special handling below)
        }

        # Special case: üçlü - always return üçlü (variant 1) unless specific context
        if token_id == 2227:
            return tokens[1] if len(tokens) > 1 else tokens[0]

        # Akış (aka/akı) Exception (2199) - Default to "akı" (variant 1)
        # "aka" is only used in specific contexts like "akacak"
        if token_id == 2199:
            if i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                # Use "aka" only when followed by vowel-starting suffixes like "acak"
                if next_str.startswith("a") or next_str.startswith("e"):
                    return tokens[0]  # "aka" for "akacak"
            # Default to "akı" for all other cases
            return tokens[1] if len(tokens) > 1 else tokens[0]

        # Ata/Atı Exception (2212) - for "atılırsa", "atılmak", "atıyorlar" etc.
        # Use "atı" (variant 1) when followed by 'l' (passive) or 'y' (yor, yacak)
        if token_id == 2212:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                if next_str.strip().startswith("l") or next_str.strip().startswith("y"):
                    # "atı" + "lırsa" = "atılırsa", "atı" + "yorlar" = "atıyorlar"
                    return tokens[1]
            return tokens[0]  # "ata" by default

        # Special case: yaşı/yaşa - return yaşı before 'na' suffix
        if token_id == 2209:
            if i < len(ids) - 1:
                # 20188 = 'na'
                if ids[i + 1] == 20188:
                    return tokens[1] if len(tokens) > 1 else tokens[0]
            return tokens[0]

        # Alın (alın/aln) Exception (182) - Default to "alın" (variant 0)
        # Only use "aln" when followed by possessive vowel suffix
        if token_id == 182:
            if i < len(ids) - 1:
                next_id = ids[i + 1]
                # Only drop the vowel for simple possessive suffixes
                # 20034 = 'ı', 20033 = 'i', 20035 = 'u', 20036 = 'ü'
                if next_id in (20034, 20033, 20035, 20036):
                    return tokens[1] if len(tokens) > 1 else tokens[0]  # "aln" + ı
            # Keep "alın" for all other cases
            return tokens[0]

        # Ilim/Ilm Exception (166) - Default to "ilim" (variant 0)
        # Only use "ilm" when followed by single-vowel possessive suffix
        if token_id == 166:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_id = ids[i + 1]
                # Only use "ilm" for the possessive/buffer case (ilmi, ilme)
                if next_id in (20033, 20038):  # 'i', 'e'
                    return tokens[1]  # "ilm" + i = "ilmi"
            return tokens[0]  # Default to "ilim"
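
        # For orientation (illustrative, inferred from the comments in this
        # method, not from the shipped vocab): reverse_dict maps an id to its
        # surface variants, e.g. reverse_dict[336] -> ["kap", "kab"] and
        # reverse_dict[107] -> ["tutuk", "tutuğ", "tutk"]; the exceptions in
        # this method pick an index into such variant lists.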

        # Boya/Boyu Exception (2220) - "boya" (paint) vs "boyu" (height)
        # Use "boyu" (variant 1) by default
        # Use "boya" only for paint-related suffix patterns (boyanan, boyamak, boyalı, etc.)
        if token_id == 2220:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_id = ids[i + 1]
                next_str = self.reverse_dict[next_id][0]
                # Use "boya" only when followed by actual suffix tokens starting
                # with 'n', 'm', 'l', 'd' (boyanan, boyamak, boyalı, boyadan) -
                # these are paint-related contexts
                if (
                    next_id >= 20000
                    and next_str.strip()
                    and next_str.strip()[0] in "nmld"
                ):
                    return tokens[0]  # "boya"
            return tokens[1] if len(tokens) > 1 else tokens[0]  # "boyu" by default

        # Bile/Bili Exception (2307) - for "bilir", "biliyor" vs "biler", "bileyor"
        # Use "bili" (variant 1) when followed by 'r' or 'yor'
        if token_id == 2307:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                if next_str.strip().startswith("r") or next_str.strip() == "yor":
                    # "bili" + "r" = "bilir", "bili" + "yor" = "biliyor"
                    return tokens[1]
            return tokens[0]  # Default to "bile"

        # Ada/Adı Exception (2218) - Default to "adı" (variant 1)
        if token_id == 2218:
            if i < len(ids) - 1:
                next_id = ids[i + 1]
                next_str = self.reverse_dict[next_id][0]
                # Use "ada" when followed by 'n' suffixes or 'yı' (for the adayı
                # pattern) or 'ma' (adama)
                # 20017 = suffix yı, 32725 = BPE yı, 20002 = ma/me
                if (
                    next_id == 20040
                    or next_str.startswith("n")
                    or next_id in (20017, 32725, 20002, 32763)
                ):
                    return tokens[0]  # "ada" for "adanın", "adayı", "adama"
            # Default to "adı" for most cases
            return tokens[1] if len(tokens) > 1 else tokens[0]

        # Kap/Kab Exception (336) - favor "kapı" (door) over "kab" (container) context
        # "kapımızı" (our door) tokenizes as kap + ımız + ı -> the default would
        # soften it to kabımızı, so we prevent softening for 336 in potential
        # door contexts
        if token_id == 336:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                # If followed by a vowel (which triggers softening by default),
                # this may be a possessive plural:
                # kap + ımız -> kapımız (door) vs kabımız (container)
                # We prioritize "door" (kap) as it's more common
                if self._starts_with_vowel(next_str):
                    return tokens[0]  # Keep "kap"
            return tokens[0]  # Default "kap"

        # Emekli/Emekle Exception (2295) - Default to "emekli" (variant 1)
        if token_id == 2295:
            if i < len(ids) - 1:
                next_id = ids[i + 1]
                # 20041 = 'yor' - for "emekliyor" use the base form
                if next_id == 20041:
                    return tokens[0]  # "emekle" + yor = emekliyor
            # Default to "emekli"
            return tokens[1] if len(tokens) > 1 else tokens[0]

        # Tutuk/Tutuğ/Tutk Exception (107) - for "tutkun" (fan/devotee)
        # Use "tutk" (variant 2) when followed by a suffix starting with 'u'
        # (for un/unlar etc.); otherwise use default "tutuk" (don't soften to tutuğ)
        if token_id == 107:
            if len(tokens) > 2 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                # Check if the next token starts with 'u' (un, unlar, etc.)
                if next_str.strip().startswith("u"):
                    return tokens[2]  # "tutk" + "un" = "tutkun"
            # For other cases, use the default form (tutuk), not softened (tutuğ)
            return tokens[0]

        # Başla/Başlı Exception (2206) - for "başlıca" (primary/mainly)
        # Use "başlı" (variant 1) when followed by 'ca/ce' suffix
        if token_id == 2206:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_id = ids[i + 1]
                # 20005 = 'ça/çe' suffix, 20047 = 'ce', 20207 = BPE 'ca'
                if next_id in (20005, 20047, 20207):
                    return tokens[1]  # "başlı" + "ca" = "başlıca"
            # Otherwise fall through to the generic logic below

        # Dip/Dib Exception (2406) - soften to "dib" before vowel suffixes
        # (dibinde, dibini, etc.)
        # Token 2406 is outside the 100-2080 softening range, so it is handled here
        if token_id == 2406:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                if self._starts_with_vowel(next_str.strip()):
                    return tokens[1]  # "dib" + "inde" = "dibinde"
            return tokens[0]  # "dip" by default

        if token_id in [19531, 19968]:  # de, ye
            # Special handling for de/ye narrowing:
            # de -> di, ye -> yi when followed by yor, or by variable suffixes
            # starting with a vowel (which get the 'y' buffer)
            should_narrow = False
            if next_token.strip() == "yor":
                should_narrow = True
            elif ids[i + 1] in self.reverse_dict:
                # Check if the next suffix starts with a vowel, invoking the
                # 'y' buffer, e.g. acak/ecek -> yacak/yecek
                suff_forms = self.reverse_dict[ids[i + 1]]
                if suff_forms and any(
                    s.startswith(("a", "e", "ı", "i", "u", "ü", "o", "ö"))
                    for s in suff_forms
                ):
                    should_narrow = True
            if should_narrow:
                # Replace the last char e -> i (handle space prefix)
                original = tokens[0]
                if original.endswith("e"):
                    return original[:-1] + "i"
                elif original.endswith("E"):
                    return original[:-1] + "İ"
            return tokens[0]

        if 100 <= token_id < 2080:
            # Softening range (roots in NO_SOFTENING_ROOTS were already handled above)
            if self._starts_with_vowel(next_token):
                return tokens[1]
            elif token_id <= 110 and next_token.strip() == "ı":
                return tokens[2]
            else:
                return tokens[0]
        elif 2080 <= token_id < 2315:
            if next_token.strip() == "yor":
                return tokens[1]
            else:
                return tokens[0]
        return tokens[0]

    def decode(self, ids: List[int]) -> str:
        """Decode a list of token IDs to text."""
        if not ids:
            return ""

        text_parts = []
        i = 0
        while i < len(ids):
            token_id = ids[i]

            # Handle special tokens
            if token_id == 0 and i < len(ids) - 1:  # uppercase
                next_token = self._select_correct_root(i + 1, ids)
                if next_token.startswith(" "):
                    text_parts.append(" " + self._tr_capitalize(next_token.lstrip()))
                else:
                    text_parts.append(self._tr_capitalize(next_token))
                i += 2
                continue
            elif token_id == 1:  # unknown
                text_parts.append("▁u▁")
            elif token_id in self.reverse_dict:
                tokens = self.reverse_dict[token_id]
                if len(tokens) > 1:
                    if token_id < 20000:  # root token
                        text_parts.append(self._select_correct_root(i, ids))
                    else:  # suffix token
                        # Find context from previous tokens. We need enough
                        # context for both vowel harmony (looking back past
                        # consonants) and consonant harmony (the immediately
                        # preceding char)
                        prev_token = ""
                        j = len(text_parts) - 1
                        tokens_found = 0
                        # Look back up to 3 tokens or until we have enough context
                        temp_context = []
                        while j >= 0 and tokens_found < 3:
                            part = text_parts[j]
                            temp_context.insert(0, part)
                            if any(c.isalpha() for c in part):
                                tokens_found += 1
                            j -= 1
                        if temp_context:
                            prev_token = "".join(temp_context)
                        text_parts.append(
                            self._select_correct_suffix(i, ids, prev_token)
                        )
                else:
                    text_parts.append(tokens[0])
            else:
                text_parts.append("▁")
            i += 1

        return "".join(text_parts)


class TokenType(Enum):
    ROOT = "ROOT"
    SUFFIX = "SUFFIX"
    BPE = "BPE"
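

# Token-id layout assumed throughout this module (inferred from the range
# checks in the code above, not from a published spec):
#   0          uppercase marker
#   1          unknown marker
#   < 20000    roots (100-2079 soften before vowels; 2080-2314 narrow before "yor")
#   >= 20000   suffixes (e.g. 20023 la/le/yla/yle, 20029 lık/lik/luk/lük),
#              with BPE fallback ids in the 32xxx range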


class TurkishTokenizer:
    def __init__(self, kokler_file=None, ekler_file=None, bpe_file=None):
        # Get the directory where this module is located
        package_dir = os.path.dirname(os.path.abspath(__file__))

        # Use provided files or fall back to package resource paths
        if kokler_file is None:
            kokler_file = os.path.join(package_dir, "vocabs/kokler.json")
        if ekler_file is None:
            ekler_file = os.path.join(package_dir, "vocabs/ekler.json")
        if bpe_file is None:
            bpe_file = os.path.join(package_dir, "vocabs/bpe_tokenler.json")

        # Load JSON files
        with open(kokler_file, "r", encoding="utf-8") as f:
            roots = json.load(f)
        with open(ekler_file, "r", encoding="utf-8") as f:
            suffixes = json.load(f)
        with open(bpe_file, "r", encoding="utf-8") as f:
            bpe_tokens = json.load(f)

        # Store the dictionaries as instance attributes
        self.roots = roots
        self.suffixes = suffixes
        self.bpe_tokens = bpe_tokens

        # Now create the vocab and the reverse dict
        self.vocab = self.get_vocab()
        self.reverse_dict = {}

        # Helper to populate the reverse dict
        def add_to_reverse(source_dict):
            for key, value in source_dict.items():
                if value not in self.reverse_dict:
                    self.reverse_dict[value] = []
                # Avoid duplicates
                if key not in self.reverse_dict[value]:
                    self.reverse_dict[value].append(key)

        add_to_reverse(self.roots)
        add_to_reverse(self.suffixes)
        add_to_reverse(self.bpe_tokens)

        self.decoder = TurkishDecoder(self.reverse_dict)
        self.vocab_size = len(self.reverse_dict)

        self.max_root_len = max(len(k) for k in roots) if roots else 0
        self.max_suffix_len = max(len(k) for k in suffixes) if suffixes else 0
        self.max_bpe_len = max(len(k) for k in bpe_tokens) if bpe_tokens else 0

        # Filter BPE tokens: remove any that also exist in suffixes
        # This guarantees suffix priority (matching the Rust behavior)
        self.bpe_tokens_filtered = {
            k: v for k, v in bpe_tokens.items() if k not in suffixes
        }

        # NOTE: the angle-bracketed special-token strings below are assumed
        # (the original names were lost); adjust them to the actual keys in
        # kokler.json.
        self.uppercase_marker = {
            "token": "<uppercase>",
            "id": roots["<uppercase>"],
            "type": TokenType.ROOT,
        }
        self.unknown_marker = {
            "token": "<unknown>",
            "id": roots["<unknown>"],
            "type": TokenType.ROOT,
        }
        self.space_marker = {"token": " ", "id": roots[" "], "type": TokenType.ROOT}

        # added to be compatible with SFTTrainer
        self.pad_token = "<pad>"
        self.eos_token = "<eos>"
        self.pad_token_id = roots[self.pad_token]
        self.eos_token_id = roots[self.eos_token]

    # added to be compatible with SFTTrainer
    def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
        return [self.vocab[token] for token in tokens]

    # added to be compatible with SFTTrainer
    def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
        # CORRECTED: Return the first variant string, not the list of variants
        return [self.reverse_dict[id][0] for id in ids]

    def get_vocab(self) -> Dict[str, int]:
        # Order matters: suffixes AFTER bpe_tokens so suffixes win on conflicts
        return {**self.roots, **self.bpe_tokens, **self.suffixes}
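
    # The scorer in _tokenize_word below can be traced by hand (the word and
    # the vocab entries here are illustrative, not guaranteed to be in the
    # shipped vocab): for "kitaplar", the root candidate "kitap" scores
    # len("kitap") == 5 plus a follow-up-suffix bonus len("lar") == 3, total 8,
    # beating shorter candidates, so the word splits as kitap + lar. A candidate
    # that consumes the whole remaining substring gets a +5 full-match bonus
    # instead, which is what keeps atomic roots like "kapı" ahead of kap + ı.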

    def _tokenize_word(self, word: str) -> Tuple[List[dict], List[int]]:
        uppercase_indices = [i for i, c in enumerate(word) if c.isupper()]
        result = []
        segments = self._camel_split_with_positions(word)

        for seg, orig_pos in segments:
            if orig_pos < len(word) and word[orig_pos].isupper():
                result.append(self.uppercase_marker)

            # Only prepend a space if at start and not whitespace
            should_add_space = orig_pos == 0 and not seg.isspace()
            if should_add_space:
                seg = " " + seg

            s = self._tr_lower(seg)
            pos = 0
            while pos < len(s):
                substr = s[pos:]
                r_matches = self._all_prefix_matches(
                    substr, self.roots, self.max_root_len
                )
                b_matches = self._all_prefix_matches(
                    substr, self.bpe_tokens_filtered, self.max_bpe_len
                )
                s_matches = self._all_prefix_matches(
                    substr, self.suffixes, self.max_suffix_len
                )

                candidates = []
                for r_id, r_tok in r_matches:
                    candidates.append(("ROOT", r_tok, r_id, len(r_tok), TokenType.ROOT))
                for b_id, b_tok in b_matches:
                    candidates.append(("BPE", b_tok, b_id, len(b_tok), TokenType.BPE))
                for s_id, s_tok in s_matches:
                    candidates.append(
                        ("SUFFIX", s_tok, s_id, len(s_tok), TokenType.SUFFIX)
                    )

                if not candidates:
                    result.append(self.unknown_marker)
                    pos += 1
                    continue

                best_candidate = None
                best_score = -1
                for c_type, c_tok, c_id, c_len, c_enum in candidates:
                    score = c_len
                    remainder = substr[c_len:]
                    if not remainder:
                        # Full match bonus
                        score += 5
                    else:
                        # Follow-up suffix bonus
                        s_next_id, s_next_tok = self._longest_prefix_lookup(
                            remainder, self.suffixes, self.max_suffix_len
                        )
                        if s_next_id is not None:
                            # Ignore 1-char variants to prefer atomic roots
                            # (e.g. Kapı vs Kap+ı)
                            if len(s_next_tok) > 1:
                                score += len(s_next_tok)

                    if score > best_score:
                        best_score = score
                        best_candidate = (c_tok, c_id, c_enum)
                    elif score == best_score:
                        # Tie-break priority: Root > Suffix > BPE
                        if (
                            c_enum == TokenType.ROOT
                            and best_candidate[2] != TokenType.ROOT
                        ):
                            best_candidate = (c_tok, c_id, c_enum)
                        elif (
                            c_enum == TokenType.SUFFIX
                            and best_candidate[2] == TokenType.BPE
                        ):
                            best_candidate = (c_tok, c_id, c_enum)

                if best_candidate is not None:
                    result.append(
                        {
                            "token": best_candidate[0],
                            "id": best_candidate[1],
                            "type": best_candidate[2],
                        }
                    )
                    pos += len(best_candidate[0])
                    continue

                result.append(self.unknown_marker)
                pos += 1

        return result, uppercase_indices

    def tokenize_text(self, text: str) -> Tuple[List[dict], List[int]]:
        final_tokens = []
        uppercase_indices = [i for i, c in enumerate(text) if c.isupper()]
        parts = text.split(" ")

        for idx, part in enumerate(parts):
            part = part.strip()
            # part = " " + part
            if part.strip():
                tokens, _ = self._tokenize_word(part)
                cleaned_tokens = []
                for i, token in enumerate(tokens):
                    if (
                        i >= 2
                        and not (0 <= token["id"] <= 19999)
                        and tokens[i - 2] == self.uppercase_marker
                        and tokens[i - 1] == self.space_marker
                    ):
                        cleaned_tokens.pop(-1)
                    # If this token is the uppercase marker, check the previous token
                    if (
                        token == self.uppercase_marker
                        and len(cleaned_tokens) > 0
                        and cleaned_tokens[-1] == self.space_marker
                    ):
                        should_pop = False
                        if i + 1 < len(tokens):
                            next_tok_str = tokens[i + 1]["token"]
                            if next_tok_str.startswith(" "):
                                should_pop = True
                        if should_pop:
                            # Remove the trailing " " before the uppercase marker
                            cleaned_tokens.pop()
                    cleaned_tokens.append(token)
                final_tokens.extend(cleaned_tokens)

        return final_tokens, uppercase_indices

    def encode(self, text: str) -> List[int]:
        tokens, _ = self.tokenize_text(text)
        return [t["id"] for t in tokens]

    def tokenize(self, text: str) -> List[str]:
        tokens, _ = self.tokenize_text(text)
        return [t["token"] for t in tokens]

    def _longest_prefix_lookup(
        self, s: str, table: Dict[str, int], max_len: int = None
    ) -> Tuple[Optional[int], str]:
        end = min(len(s), max_len) if max_len else len(s)
        for i in range(end, 0, -1):
            cand = s[:i]
            if cand in table:
                return table[cand], cand
        return None, ""

    def _all_prefix_matches(
        self, s: str, table: Dict[str, int], max_len: int = None
    ) -> List[Tuple[int, str]]:
        matches = []
        end = min(len(s), max_len) if max_len else len(s)
        for i in range(end, 0, -1):
            prefix = s[:i]
            if prefix in table:
                matches.append((table[prefix], prefix))
        return matches

    def _tr_lower(self, word: str) -> str:
        if "I" in word or "İ" in word:
            word = word.replace("İ", "i").replace("I", "ı")
        return word.lower()

    def _camel_split_with_positions(self, word: str) -> List[Tuple[str, int]]:
        if not word:
            return []
        parts = []
        start = 0
        for i in range(1, len(word)):
            if word[i].isupper():
                if start < i:
                    parts.append((self._tr_lower(word[start:i]), start))
                start = i
        if start < len(word):
            parts.append((self._tr_lower(word[start:]), start))
        return parts if parts else [(self._tr_lower(word), 0)]

    def decode(self, ids: List[int]) -> str:
        return self.decoder.decode(ids)

    # added to be compatible with SFTTrainer
    def __call__(self, text: str) -> Dict[str, List[int]]:
        input_ids = self.encode(text)
        attention_mask = [1 for _ in input_ids]
        return {"input_ids": input_ids, "attention_mask": attention_mask}


class TurkishMFTTokenizerHF(PreTrainedTokenizer):
    """
    Hugging Face Transformers compatible wrapper.
    The main tokenization logic is in TurkishTokenizer.
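
    Example usage (a sketch; assumes the packaged vocab files are available so
    the wrapper can be built with no arguments):

        tok = TurkishMFTTokenizerHF()
        enc = tok("Merhaba dünya")
        enc["input_ids"]              # token ids
        tok.decode(enc["input_ids"])  # morphology-aware detokenization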
""" model_input_names = ["input_ids", "attention_mask"] vocab_files_names = { "kokler_file": "vocabs/kokler.json", "ekler_file": "vocabs/ekler.json", "bpe_file": "vocabs/bpe_tokenler.json", } def __init__( self, kokler_file=None, ekler_file=None, bpe_file=None, **kwargs, ): # Initialize the actual tokenizer self._tok = TurkishTokenizer( kokler_file=kokler_file, ekler_file=ekler_file, bpe_file=bpe_file ) # HF sometimes requires vocab_size self._vocab = self._tok.get_vocab() # IMPORTANT: PreTrainedTokenizer manages special tokens # Your vocab already has , . super().__init__( pad_token=kwargs.pop("pad_token", ""), eos_token=kwargs.pop("eos_token", ""), unk_token=kwargs.pop("unk_token", ""), **kwargs, ) # Ids self.pad_token_id = self._tok.pad_token_id self.eos_token_id = self._tok.eos_token_id self.unk_token_id = self._tok.roots.get("", 1) @property def vocab_size(self) -> int: return len(self._tok.reverse_dict) def get_vocab(self) -> Dict[str, int]: return dict(self._vocab) # HF's base function to call def _tokenize(self, text: str) -> List[str]: # You can modify behavior here: normalize, replace, etc. # e.g.: text = text.replace("naber", "ne haber") return self._tok.tokenize(text) def _convert_token_to_id(self, token: str) -> int: return self._vocab.get(token, self.unk_token_id) def _convert_id_to_token(self, index: int) -> str: # reverse_dict: id -> [variants] if index in self._tok.reverse_dict: return self._tok.reverse_dict[index][0] return self.unk_token def convert_tokens_to_string(self, tokens: List[str]) -> str: # Your decode uses ids; let's not just join simply. # HF uses this function sometimes. You can do tokens->ids->decode. ids = [self._convert_token_to_id(t) for t in tokens] return self._tok.decode(ids) def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: # You can add BOS/EOS here if you want. if token_ids_1 is None: return token_ids_0 return token_ids_0 + token_ids_1 def save_vocabulary( self, save_directory: str, filename_prefix: Optional[str] = None ) -> Tuple[str, ...]: # Create vocabs subdirectory vocabs_dir = os.path.join(save_directory, "vocabs") os.makedirs(vocabs_dir, exist_ok=True) prefix = filename_prefix or "" # Save all three vocabulary files kokler_path = os.path.join(vocabs_dir, f"{prefix}kokler.json") ekler_path = os.path.join(vocabs_dir, f"{prefix}ekler.json") bpe_path = os.path.join(vocabs_dir, f"{prefix}bpe_tokenler.json") with open(kokler_path, "w", encoding="utf-8") as f: json.dump(self._tok.roots, f, ensure_ascii=False, indent=2) with open(ekler_path, "w", encoding="utf-8") as f: json.dump(self._tok.suffixes, f, ensure_ascii=False, indent=2) with open(bpe_path, "w", encoding="utf-8") as f: json.dump(self._tok.bpe_tokens, f, ensure_ascii=False, indent=2) return (kokler_path, ekler_path, bpe_path) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): # default behavior is sufficient return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)