| import json |
| import os |
| from enum import Enum |
| from typing import Dict, List, Optional, Tuple, Union |
|
|
| from transformers import PreTrainedTokenizer |
|
|
|
|
| __version__ = "1.0.3" |
|
|
|
|
class TurkishDecoder:
    """Decode a sequence of token ids back into Turkish text.

    Works from a reverse vocabulary (token id -> list of surface forms).
    Index 0 of each list is the default surface form; later entries are
    vowel-harmony / consonant-mutation variants. The rules below choose
    between variants using the surrounding token context.
    """

    # Character inventories used by the harmony rules.
    ALL_VOWELS = "aeıioöuüâ"       # every vowel handled, incl. circumflex â
    INCE_VOWELS = "eiöü"           # front ("ince") vowels
    AI_VOWELS = "aıâ"              # back unrounded harmony group
    EI_VOWELS = "ei"               # front unrounded harmony group
    OU_VOWELS = "ou"               # back rounded harmony group
    HARD_CONSONANTS = "fstkçşhp"   # hard ("sert") consonants
    WHITESPACE = " \n\t"

    def __init__(self, reverse_dict: Dict[int, List[str]]):
        # reverse_dict: token id -> candidate surface strings.
        self.reverse_dict = reverse_dict

    def _tr_capitalize(self, word: str) -> str:
        """Capitalize using Turkish casing rules (i -> İ, ı -> I)."""
        if not word:
            return ""
        # Plain str.capitalize() would map "i" -> "I"; Turkish needs "İ".
        if word.startswith("i"):
            return "İ" + word[1:]
        return word.capitalize()

    def _starts_with_vowel(self, word: str) -> bool:
        """Check if word starts with a vowel."""
        return bool(word and word[0] in self.ALL_VOWELS)

    def _ends_with_vowel(self, word: str) -> bool:
        """Check if word ends with a vowel."""
        return bool(word and word[-1] in self.ALL_VOWELS)

    def _ends_with_any(self, word: str, charset: str) -> bool:
        """Scan *word* from the end; True if a charset character appears
        before any other vowel (i.e. the word's last vowel belongs to
        *charset* when charset is a vowel group).
        """
        i = len(word) - 1
        while i >= 0:
            if word[i] in charset:
                return True
            if word[i] in self.ALL_VOWELS:
                # Hit a vowel outside charset first -> different harmony class.
                return False
            i -= 1
        return False

    def _ends_with_ince(self, word: str) -> bool:
        """Check if word ends with front vowels (ince ünlü)."""
        # Loanword exceptions that take front-vowel suffixes despite
        # ending in a back vowel.
        if word in ("saat", "kilovatsaat", "ziraat", "itaat", "istikbal"):
            return True

        return self._ends_with_any(word, self.INCE_VOWELS)

    def _ends_with_sert_unsuz(self, word: str) -> bool:
        """Check if word ends with a hard consonant."""
        return bool(word and word[-1] in self.HARD_CONSONANTS)

    def _get_vowel_suffix_index(self, prev_token: str) -> int:
        """Get suffix index based on vowel harmony rules.

        Returns 0..3 selecting among four-way (ı/i/u/ü) harmony variants,
        driven by the last vowel of *prev_token*.
        """
        if self._ends_with_any(prev_token, self.AI_VOWELS):
            return 0
        elif self._ends_with_any(prev_token, self.EI_VOWELS):
            return 1
        elif self._ends_with_any(prev_token, self.OU_VOWELS):
            return 2
        return 3

    def _select_correct_suffix(self, i: int, ids: List[int], prev_token: str) -> str:
        """Select the correct suffix form based on morphological rules.

        The id thresholds below partition the suffix vocabulary by how many
        harmony variants each suffix has — presumably mirroring the layout
        of ekler.json; TODO confirm against that file before changing.
        """
        suffixes = self.reverse_dict[ids[i]]
        token_id = ids[i]

        if token_id < 20013:
            # Two-way (a/e) harmony suffixes.
            return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]

        elif token_id < 20023:
            # Four-way (ı/i/u/ü) harmony suffixes.
            return suffixes[self._get_vowel_suffix_index(prev_token)]

        elif token_id == 20023:
            # la/le vs. yla/yle: the buffer "y" only applies at word end,
            # so peek at the next token to detect a word boundary.
            end_of_word = True
            if i < len(ids) - 1:
                next_token = self.reverse_dict[ids[i + 1]][0]
                if next_token not in self.WHITESPACE:
                    end_of_word = False
            return self._handle_la_le_suffix(prev_token, suffixes, end_of_word)

        elif token_id <= 20025:
            # da/de locative — hardens to ta/te after hard consonants.
            return self._handle_da_de_suffix(prev_token, suffixes)

        elif 20025 < token_id < 20029:
            # dı/di/du/dü past tense — hardens to tı/ti/tu/tü.
            return self._handle_di_du_suffix(prev_token, suffixes)

        elif token_id == 20029:
            return self._handle_lik_suffix(i, ids, prev_token, suffixes)

        elif token_id == 20030:
            return self._handle_cik_suffix(i, ids, prev_token, suffixes)

        elif token_id == 20031:
            return self._handle_mak_suffix(i, ids, prev_token, suffixes)

        elif token_id == 20032:
            return self._handle_acak_suffix(i, ids, prev_token, suffixes)

        # No special rule for this id: fall back to the default form.
        return suffixes[0]

    def _handle_la_le_suffix(
        self, prev_token: str, suffixes: List[str], end_of_word: bool
    ) -> str:
        """Handle la/le/yla/yle suffix selection.

        Layout assumed: [la, le, yla, yle] — the y-buffered forms are used
        only after a vowel and at word end.
        """
        if self._ends_with_vowel(prev_token) and end_of_word:
            return suffixes[3] if self._ends_with_ince(prev_token) else suffixes[2]
        else:
            return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]

    def _handle_da_de_suffix(self, prev_token: str, suffixes: List[str]) -> str:
        """Handle da/de/ta/te suffix selection (consonant hardening)."""
        if self._ends_with_sert_unsuz(prev_token):
            return suffixes[3] if self._ends_with_ince(prev_token) else suffixes[2]
        return suffixes[1] if self._ends_with_ince(prev_token) else suffixes[0]

    def _handle_di_du_suffix(self, prev_token: str, suffixes: List[str]) -> str:
        """Handle dı/di/du/dü suffix selection.

        Layout assumed: indices 0-3 are d-forms, 4-7 the hardened t-forms.
        """
        base_index = self._get_vowel_suffix_index(prev_token)
        return (
            suffixes[base_index + 4]
            if self._ends_with_sert_unsuz(prev_token)
            else suffixes[base_index]
        )

    def _handle_lik_suffix(
        self, i: int, ids: List[int], prev_token: str, suffixes: List[str]
    ) -> str:
        """Handle lık/lik/luk/lük suffix selection.

        Indices 4-7 are the softened (lığ-style) forms used before a
        vowel-initial continuation — presumably; verify against ekler.json.
        """
        if i >= len(ids) - 1:
            return suffixes[0]

        next_token = self.reverse_dict[ids[i + 1]][0]
        base_index = self._get_vowel_suffix_index(prev_token)
        return (
            suffixes[base_index + 4]
            if self._starts_with_vowel(next_token)
            else suffixes[base_index]
        )

    def _handle_cik_suffix(
        self, i: int, ids: List[int], prev_token: str, suffixes: List[str]
    ) -> str:
        """Handle cık/cik/cuk/cük suffix selection.

        16 variants assumed: 4 vowel classes x {plain c, hardened ç} x
        {consonant-final, softened vowel-initial} continuation.
        """
        if i >= len(ids) - 1:
            return suffixes[0]

        next_token = self.reverse_dict[ids[i + 1]][0]
        base_index = self._get_vowel_suffix_index(prev_token)

        # Offset picks the block: +4 hardened, +8 softened, +12 both.
        if self._starts_with_vowel(next_token):
            offset = 12 if self._ends_with_sert_unsuz(prev_token) else 8
        else:
            offset = 4 if self._ends_with_sert_unsuz(prev_token) else 0

        return suffixes[base_index + offset]

    def _handle_mak_suffix(
        self, i: int, ids: List[int], prev_token: str, suffixes: List[str]
    ) -> str:
        """Handle mak/mek/may/mey suffix selection.

        Indices 2-3 are the may/mey forms used before a vowel-initial
        continuation (final k drops).
        """
        if i >= len(ids) - 1:
            return suffixes[0]

        next_token = self.reverse_dict[ids[i + 1]][0]
        base_index = 1 if self._ends_with_ince(prev_token) else 0
        return (
            suffixes[base_index + 2]
            if self._starts_with_vowel(next_token)
            else suffixes[base_index]
        )

    def _handle_acak_suffix(
        self, i: int, ids: List[int], prev_token: str, suffixes: List[str]
    ) -> str:
        """Handle acak/ecek/yacak/yecek suffix selection.

        8 variants assumed, indexed by three binary features: previous
        token vowel-final (adds buffer y), front/back harmony, and
        vowel-initial continuation (softens final k -> ğ).
        """
        is_vowel_ending = self._ends_with_vowel(prev_token)
        is_ince = self._ends_with_ince(prev_token)

        # Peek ahead to decide whether the final consonant softens.
        is_vowel_starting = False
        if i < len(ids) - 1:
            next_token = self.reverse_dict[ids[i + 1]][0]
            is_vowel_starting = self._starts_with_vowel(next_token)

        if is_vowel_starting:
            if is_vowel_ending:
                return suffixes[7] if is_ince else suffixes[6]
            else:
                return suffixes[3] if is_ince else suffixes[2]
        else:
            if is_vowel_ending:
                return suffixes[5] if is_ince else suffixes[4]
            else:
                return suffixes[1] if is_ince else suffixes[0]

    def _select_correct_root(self, i: int, ids: List[int]) -> str:
        """Select the correct root form based on morphological context.

        Roots with several surface forms (consonant softening, vowel
        narrowing, ...) are disambiguated by looking at the following
        token. The specific ids below refer to entries in kokler.json /
        ekler.json — TODO confirm each against the vocab files before
        changing; the order of the checks is load-bearing.
        """
        token_id = ids[i]
        tokens = self.reverse_dict[token_id]

        # Last token in the sequence: no lookahead, use the default form.
        if i > len(ids) - 2:
            return tokens[0]

        next_token = self.reverse_dict[ids[i + 1]][0]

        # Roots that always keep their default spelling (no softening).
        NO_SOFTENING_ROOTS = {
            204,
            220,
            298,
        }
        if token_id in NO_SOFTENING_ROOTS:
            return tokens[0]

        # NOTE(review): defined but never used — ids 2227 and 2209 are
        # handled individually below. Candidate for removal.
        DEFAULT_VARIANT_1_ROOTS = {
            2227,
            2209,
        }

        # Root 2227: always prefers its variant form when available.
        if token_id == 2227:
            return tokens[1] if len(tokens) > 1 else tokens[0]

        # Root 2199: default form only before an a/e-initial continuation.
        if token_id == 2199:
            if i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                if next_str.startswith("a") or next_str.startswith("e"):
                    return tokens[0]
            return tokens[1] if len(tokens) > 1 else tokens[0]

        # Root 2212: variant before l- or y-initial continuations.
        if token_id == 2212:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                if next_str.strip().startswith("l") or next_str.strip().startswith("y"):
                    return tokens[1]
            return tokens[0]

        # Root 2209: variant only before the specific suffix id 20188.
        if token_id == 2209:
            if i < len(ids) - 1:
                if ids[i + 1] == 20188:
                    return tokens[1] if len(tokens) > 1 else tokens[0]
            return tokens[0]

        # Root 182: variant before a fixed set of suffix ids.
        if token_id == 182:
            if i < len(ids) - 1:
                next_id = ids[i + 1]
                if next_id in (20034, 20033, 20035, 20036):
                    return tokens[1] if len(tokens) > 1 else tokens[0]
            return tokens[0]

        # Root 166: variant before suffix ids 20033 / 20038.
        if token_id == 166:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_id = ids[i + 1]
                if next_id in (20033, 20038):
                    return tokens[1]
            return tokens[0]

        # Root 2220: default form before suffixes (id >= 20000) starting
        # with n/m/l/d, otherwise the variant form.
        if token_id == 2220:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_id = ids[i + 1]
                next_str = self.reverse_dict[next_id][0]
                if (
                    next_id >= 20000
                    and next_str.strip()
                    and next_str.strip()[0] in "nmld"
                ):
                    return tokens[0]
            return tokens[1] if len(tokens) > 1 else tokens[0]

        # Root 2307: variant before r-initial continuations or "yor".
        if token_id == 2307:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                if next_str.strip().startswith("r") or next_str.strip() == "yor":
                    return tokens[1]
            return tokens[0]

        # Root 2218: default form before n-initial continuations or a
        # fixed set of suffix ids; otherwise the variant.
        if token_id == 2218:
            if i < len(ids) - 1:
                next_id = ids[i + 1]
                next_str = self.reverse_dict[next_id][0]
                if (
                    next_id == 20040
                    or next_str.startswith("n")
                    or next_id in (20017, 32725, 20002, 32763)
                ):
                    return tokens[0]
            return tokens[1] if len(tokens) > 1 else tokens[0]

        # NOTE(review): both branches return tokens[0]; the vowel check is
        # a no-op. Possibly an unfinished rule — confirm intended behavior.
        if token_id == 336:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                if self._starts_with_vowel(next_str):
                    return tokens[0]
            return tokens[0]

        # Root 2295: default form only before suffix id 20041.
        if token_id == 2295:
            if i < len(ids) - 1:
                next_id = ids[i + 1]
                if next_id == 20041:
                    return tokens[0]
            return tokens[1] if len(tokens) > 1 else tokens[0]

        # Root 107: third variant before a u-initial continuation.
        if token_id == 107:
            if len(tokens) > 2 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                if next_str.strip().startswith("u"):
                    return tokens[2]
            return tokens[0]

        # NOTE(review): this branch has no fall-back return — when the
        # condition fails, control falls through to the generic
        # 2080 <= id < 2315 handling below. Confirm this is intentional.
        if token_id == 2206:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_id = ids[i + 1]
                if next_id in (20005, 20047, 20207):
                    return tokens[1]

        # Root 2406: variant before a vowel-initial continuation.
        if token_id == 2406:
            if len(tokens) > 1 and i < len(ids) - 1:
                next_str = self.reverse_dict[ids[i + 1]][0]
                if self._starts_with_vowel(next_str.strip()):
                    return tokens[1]
            return tokens[0]
        # Roots 19531/19968: vowel narrowing — a final e narrows to i when
        # followed by "yor" or any vowel-initial suffix form.
        if token_id in [19531, 19968]:
            should_narrow = False

            if next_token.strip() == "yor":
                should_narrow = True
            elif ids[i + 1] in self.reverse_dict:
                # Narrow if any surface form of the next token is
                # vowel-initial (not just the default form).
                suff_forms = self.reverse_dict[ids[i + 1]]
                if suff_forms and any(
                    s.startswith(("a", "e", "ı", "i", "u", "ü", "o", "ö"))
                    for s in suff_forms
                ):
                    should_narrow = True

            if should_narrow:
                original = tokens[0]
                if original.endswith("e"):
                    return original[:-1] + "i"
                elif original.endswith("E"):
                    return original[:-1] + "İ"
            return tokens[0]

        # Generic id ranges — presumably fixed blocks of kokler.json.
        if 100 <= token_id < 2080:
            # Softening roots: variant 1 before a vowel; ids <= 110 also
            # carry a third variant used before a bare "ı".
            if self._starts_with_vowel(next_token):
                return tokens[1]
            elif token_id <= 110 and next_token.strip() == "ı":
                return tokens[2]
            else:
                return tokens[0]

        elif 2080 <= token_id < 2315:
            # Verb roots whose variant form is used before "yor".
            if next_token.strip() == "yor":
                return tokens[1]
            else:
                return tokens[0]

        return tokens[0]

    def decode(self, ids: List[int]) -> str:
        """Decode a list of token IDs to text."""
        if not ids:
            return ""

        text_parts = []
        i = 0

        while i < len(ids):
            token_id = ids[i]
            # Id 0 — presumably the <uppercase> marker — capitalizes the
            # next token (which is resolved and consumed here).
            if token_id == 0 and i < len(ids) - 1:
                next_token = self._select_correct_root(i + 1, ids)
                if next_token.startswith(" "):
                    # Keep the leading space, capitalize the rest.
                    text_parts.append(" " + self._tr_capitalize(next_token.lstrip()))
                else:
                    text_parts.append(self._tr_capitalize(next_token))
                i += 2
                continue
            elif token_id == 1:
                # Id 1 — presumably the <unknown> marker's placeholder.
                text_parts.append("▁u▁")
            elif token_id in self.reverse_dict:
                tokens = self.reverse_dict[token_id]
                if len(tokens) > 1:
                    # Ambiguous id: ids < 20000 are disambiguated as roots,
                    # the rest as suffixes.
                    if token_id < 20000:
                        text_parts.append(self._select_correct_root(i, ids))
                    else:
                        # Rebuild up to 3 alphabetic chunks of already
                        # decoded text as the harmony context.
                        prev_token = ""
                        j = len(text_parts) - 1
                        tokens_found = 0

                        temp_context = []
                        while j >= 0 and tokens_found < 3:
                            part = text_parts[j]
                            temp_context.insert(0, part)
                            if any(c.isalpha() for c in part):
                                tokens_found += 1
                            j -= 1

                        if temp_context:
                            prev_token = "".join(temp_context)

                        text_parts.append(
                            self._select_correct_suffix(i, ids, prev_token)
                        )
                else:
                    text_parts.append(tokens[0])
            else:
                # Id not in the vocabulary: emit a placeholder.
                text_parts.append("▁")

            i += 1

        return "".join(text_parts)
|
|
|
|
# Categories a produced token can fall into: dictionary root, morphological
# suffix, or byte-pair-encoding fallback piece. Built via the Enum
# functional API; member values equal their names.
TokenType = Enum("TokenType", {"ROOT": "ROOT", "SUFFIX": "SUFFIX", "BPE": "BPE"})
|
|
|
|
class TurkishTokenizer:
    """Morphology-aware Turkish tokenizer.

    Splits text into root / suffix / BPE tokens using three JSON
    vocabularies (kokler = roots, ekler = suffixes, bpe_tokenler = BPE
    fallback pieces) and decodes ids back to text via TurkishDecoder.
    """

    def __init__(
        self,
        kokler_file: Optional[str] = None,
        ekler_file: Optional[str] = None,
        bpe_file: Optional[str] = None,
    ):
        # Default vocab locations are resolved relative to this package.
        package_dir = os.path.dirname(os.path.abspath(__file__))

        if kokler_file is None:
            kokler_file = os.path.join(package_dir, "vocabs/kokler.json")
        if ekler_file is None:
            ekler_file = os.path.join(package_dir, "vocabs/ekler.json")
        if bpe_file is None:
            bpe_file = os.path.join(package_dir, "vocabs/bpe_tokenler.json")

        # Load the three token tables (surface string -> id).
        with open(kokler_file, "r", encoding="utf-8") as f:
            roots = json.load(f)
        with open(ekler_file, "r", encoding="utf-8") as f:
            suffixes = json.load(f)
        with open(bpe_file, "r", encoding="utf-8") as f:
            bpe_tokens = json.load(f)

        self.roots = roots
        self.suffixes = suffixes
        self.bpe_tokens = bpe_tokens

        # Forward vocab (string -> id) and reverse vocab (id -> strings).
        self.vocab = self.get_vocab()
        self.reverse_dict = {}

        # One id can map to several surface forms (harmony variants);
        # insertion order is preserved so index 0 is the first form seen.
        def add_to_reverse(source_dict):
            for key, value in source_dict.items():
                if value not in self.reverse_dict:
                    self.reverse_dict[value] = []
                if key not in self.reverse_dict[value]:
                    self.reverse_dict[value].append(key)

        # Order matters: root spellings claim the default (index 0) slot
        # for any id shared across tables.
        add_to_reverse(self.roots)
        add_to_reverse(self.suffixes)
        add_to_reverse(self.bpe_tokens)

        self.decoder = TurkishDecoder(self.reverse_dict)

        # Number of distinct ids (not distinct surface strings).
        self.vocab_size = len(self.reverse_dict)

        # Longest key per table bounds the prefix searches below.
        self.max_root_len = max(len(k) for k in roots) if roots else 0
        self.max_suffix_len = max(len(k) for k in suffixes) if suffixes else 0
        self.max_bpe_len = max(len(k) for k in bpe_tokens) if bpe_tokens else 0

        # BPE entries that collide with a suffix spelling are dropped so
        # the suffix table takes precedence during matching.
        self.bpe_tokens_filtered = {
            k: v for k, v in bpe_tokens.items() if k not in suffixes
        }

        # Special marker tokens; their ids come from the roots table.
        self.uppercase_marker = {
            "token": "<uppercase>",
            "id": roots["<uppercase>"],
            "type": TokenType.ROOT,
        }
        self.unknown_marker = {
            "token": "<unknown>",
            "id": roots["<unknown>"],
            "type": TokenType.ROOT,
        }
        self.space_marker = {"token": " ", "id": roots[" "], "type": TokenType.ROOT}

        # HF-style special tokens, also sourced from the roots table.
        self.pad_token = "<pad>"
        self.eos_token = "<eos>"
        self.pad_token_id = roots[self.pad_token]
        self.eos_token_id = roots[self.eos_token]

    def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
        """Map token strings to ids (raises KeyError on unknown tokens)."""
        return [self.vocab[token] for token in tokens]

    def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
        """Map ids to their default (index 0) surface form."""
        return [self.reverse_dict[id][0] for id in ids]

    def get_vocab(self) -> Dict[str, int]:
        """Merged string -> id table. On key collisions, suffixes override
        BPE entries, which override roots (later dict wins)."""
        return {**self.roots, **self.bpe_tokens, **self.suffixes}

    def _tokenize_word(self, word: str) -> Tuple[List[dict], List[int]]:
        """Tokenize a single word into token dicts.

        Returns (tokens, uppercase_indices); each token dict carries
        "token", "id" and "type" keys. Matching is greedy with a small
        lookahead score rather than pure longest-match.
        """
        uppercase_indices = [i for i, c in enumerate(word) if c.isupper()]
        result = []

        # Split camelCase so each capitalized segment is matched alone.
        segments = self._camel_split_with_positions(word)

        for seg, orig_pos in segments:
            # Emit an <uppercase> marker for segments that began uppercase.
            if orig_pos < len(word) and word[orig_pos].isupper():
                result.append(self.uppercase_marker)

            # The word-initial segment gets a leading space so that
            # space-prefixed vocab entries can match — presumably the
            # word-boundary convention of the vocab files.
            should_add_space = orig_pos == 0 and not seg.isspace()

            if should_add_space:
                seg = " " + seg

            s = self._tr_lower(seg)
            pos = 0

            while pos < len(s):
                substr = s[pos:]

                # Collect every prefix match from each table.
                r_matches = self._all_prefix_matches(
                    substr, self.roots, self.max_root_len
                )
                b_matches = self._all_prefix_matches(
                    substr, self.bpe_tokens_filtered, self.max_bpe_len
                )
                s_matches = self._all_prefix_matches(
                    substr, self.suffixes, self.max_suffix_len
                )

                candidates = []
                for r_id, r_tok in r_matches:
                    candidates.append(("ROOT", r_tok, r_id, len(r_tok), TokenType.ROOT))
                for b_id, b_tok in b_matches:
                    candidates.append(("BPE", b_tok, b_id, len(b_tok), TokenType.BPE))
                for s_id, s_tok in s_matches:
                    candidates.append(
                        ("SUFFIX", s_tok, s_id, len(s_tok), TokenType.SUFFIX)
                    )

                # No table matches this position: emit <unknown>, skip one char.
                if not candidates:
                    result.append(self.unknown_marker)
                    pos += 1
                    continue

                best_candidate = None
                best_score = -1

                # Score = match length, +5 if the match consumes the rest
                # of the segment, else + length of a multi-char suffix that
                # the remainder would start with (favors root+suffix splits).
                for c_type, c_tok, c_id, c_len, c_enum in candidates:
                    score = c_len
                    remainder = substr[c_len:]

                    if not remainder:
                        score += 5
                    else:
                        s_next_id, s_next_tok = self._longest_prefix_lookup(
                            remainder, self.suffixes, self.max_suffix_len
                        )
                        if s_next_id is not None:
                            if len(s_next_tok) > 1:
                                score += len(s_next_tok)

                    if score > best_score:
                        best_score = score
                        best_candidate = (c_tok, c_id, c_enum)
                    elif score == best_score:
                        # Tie-break: prefer ROOT over anything, then SUFFIX
                        # over BPE.
                        if (
                            c_enum == TokenType.ROOT
                            and best_candidate[2] != TokenType.ROOT
                        ):
                            best_candidate = (c_tok, c_id, c_enum)
                        elif (
                            c_enum == TokenType.SUFFIX
                            and best_candidate[2] == TokenType.BPE
                        ):
                            best_candidate = (c_tok, c_id, c_enum)

                result.append(
                    {
                        "token": best_candidate[0],
                        "id": best_candidate[1],
                        "type": best_candidate[2],
                    }
                )
                pos += len(best_candidate[0])
                continue

                # NOTE(review): unreachable — both paths above `continue`.
                result.append(self.unknown_marker)
                pos += 1

        return result, uppercase_indices

    def tokenize_text(self, text: str) -> Tuple[List[dict], List[int]]:
        """Tokenize a full text, word by word (split on single spaces)."""
        final_tokens = []
        uppercase_indices = [i for i, c in enumerate(text) if c.isupper()]

        parts = text.split(" ")
        for idx, part in enumerate(parts):
            part = part.strip()
            # NOTE(review): part was just stripped, so this second .strip()
            # is redundant; empty parts (runs of spaces) are skipped.
            if part.strip():
                tokens, _ = self._tokenize_word(part)

                # Post-pass dropping redundant space markers around
                # <uppercase> markers — heuristic; verify against encoder
                # output before changing.
                cleaned_tokens = []
                for i, token in enumerate(tokens):
                    # Drop the space marker between <uppercase> and a
                    # non-root token (ids outside 0..19999).
                    if (
                        i >= 2
                        and not (0 <= token["id"] <= 19999)
                        and tokens[i - 2] == self.uppercase_marker
                        and tokens[i - 1] == self.space_marker
                    ):
                        cleaned_tokens.pop(-1)

                    # Drop a space marker directly before <uppercase> when
                    # the following token already carries a leading space.
                    if (
                        token == self.uppercase_marker
                        and len(cleaned_tokens) > 0
                        and cleaned_tokens[-1] == self.space_marker
                    ):
                        should_pop = False
                        if i + 1 < len(tokens):
                            next_tok_str = tokens[i + 1]["token"]
                            if next_tok_str.startswith(" "):
                                should_pop = True

                        if should_pop:
                            cleaned_tokens.pop()
                    cleaned_tokens.append(token)

                final_tokens.extend(cleaned_tokens)

        return final_tokens, uppercase_indices

    def encode(self, text: str) -> List[int]:
        """Encode text to a list of token ids."""
        tokens, _ = self.tokenize_text(text)
        return [t["id"] for t in tokens]

    def tokenize(self, text: str) -> List[str]:
        """Return the surface strings of the tokens for *text*."""
        tokens, _ = self.tokenize_text(text)
        return [t["token"] for t in tokens]

    def _longest_prefix_lookup(
        self, s: str, table: Dict[str, int], max_len: Optional[int] = None
    ) -> Tuple[Optional[int], str]:
        """Longest prefix of *s* present in *table*: (id, prefix), or
        (None, "") when nothing matches."""
        end = min(len(s), max_len) if max_len else len(s)
        for i in range(end, 0, -1):
            cand = s[:i]
            if cand in table:
                return table[cand], cand
        return None, ""

    def _all_prefix_matches(
        self, s: str, table: Dict[str, int], max_len: Optional[int] = None
    ) -> List[Tuple[int, str]]:
        """All prefixes of *s* present in *table*, longest first."""
        matches = []
        end = min(len(s), max_len) if max_len else len(s)
        for i in range(end, 0, -1):
            prefix = s[:i]
            if prefix in table:
                matches.append((table[prefix], prefix))
        return matches

    def _tr_lower(self, word: str) -> str:
        """Lowercase with Turkish dotted/dotless i handling
        (İ -> i, I -> ı) before the generic lowercase."""
        if "I" in word or "İ" in word:
            word = word.replace("İ", "i").replace("I", "ı")
        return word.lower()

    def _camel_split_with_positions(self, word: str) -> List[Tuple[str, int]]:
        """Split *word* at uppercase letters; return (lowercased segment,
        start offset in the original word) pairs."""
        if not word:
            return []

        parts = []
        start = 0

        for i in range(1, len(word)):
            if word[i].isupper():
                if start < i:
                    parts.append((self._tr_lower(word[start:i]), start))
                start = i

        if start < len(word):
            parts.append((self._tr_lower(word[start:]), start))

        return parts if parts else [(self._tr_lower(word), 0)]

    def decode(self, ids: List[int]) -> str:
        """Decode token ids back to text (delegates to TurkishDecoder)."""
        return self.decoder.decode(ids)

    def __call__(self, text: str) -> Dict[str, List[int]]:
        """HF-style call: returns input_ids and an all-ones attention mask."""
        input_ids = self.encode(text)
        attention_mask = [1 for _ in input_ids]
        return {"input_ids": input_ids, "attention_mask": attention_mask}
|
|
|
|
class TurkishMFTTokenizerHF(PreTrainedTokenizer):
    """
    Hugging Face Transformers capable wrapper.
    The main tokenization logic is in TurkishTokenizer.
    """

    model_input_names = ["input_ids", "attention_mask"]

    # File names used by save_vocabulary / from_pretrained.
    vocab_files_names = {
        "kokler_file": "vocabs/kokler.json",
        "ekler_file": "vocabs/ekler.json",
        "bpe_file": "vocabs/bpe_tokenler.json",
    }

    def __init__(
        self,
        kokler_file=None,
        ekler_file=None,
        bpe_file=None,
        **kwargs,
    ):
        # Build the backend first: PreTrainedTokenizer.__init__ may invoke
        # hooks (get_vocab/_tokenize) that need it.
        self._tok = TurkishTokenizer(
            kokler_file=kokler_file, ekler_file=ekler_file, bpe_file=bpe_file
        )

        # Cached forward vocab (string -> id).
        self._vocab = self._tok.get_vocab()

        # Register HF special tokens, honoring caller overrides.
        super().__init__(
            pad_token=kwargs.pop("pad_token", "<pad>"),
            eos_token=kwargs.pop("eos_token", "<eos>"),
            unk_token=kwargs.pop("unk_token", "<unknown>"),
            **kwargs,
        )

        # NOTE(review): recent transformers versions expose *_token_id as
        # read-only properties derived from the token strings; confirm
        # these assignments work with the pinned transformers version.
        self.pad_token_id = self._tok.pad_token_id
        self.eos_token_id = self._tok.eos_token_id
        self.unk_token_id = self._tok.roots.get("<unknown>", 1)

    @property
    def vocab_size(self) -> int:
        """Number of distinct token ids in the backend tokenizer."""
        return len(self._tok.reverse_dict)

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the string -> id vocabulary."""
        return dict(self._vocab)

    def _tokenize(self, text: str) -> List[str]:
        """Delegate tokenization to the backend tokenizer."""
        return self._tok.tokenize(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Look up a token's id; unknown tokens map to unk_token_id."""
        return self._vocab.get(token, self.unk_token_id)

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id to its default surface form, or unk_token if unknown."""
        if index in self._tok.reverse_dict:
            return self._tok.reverse_dict[index][0]
        return self.unk_token

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Rebuild text from token strings by round-tripping through ids
        so the morphology-aware decoder can pick harmony variants."""
        ids = [self._convert_token_to_id(t) for t in tokens]
        return self._tok.decode(ids)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """No special tokens are added; pairs are simply concatenated."""
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str, ...]:
        """Write the three vocab JSONs under <save_directory>/vocabs and
        return their paths."""
        vocabs_dir = os.path.join(save_directory, "vocabs")
        os.makedirs(vocabs_dir, exist_ok=True)

        prefix = filename_prefix or ""

        kokler_path = os.path.join(vocabs_dir, f"{prefix}kokler.json")
        ekler_path = os.path.join(vocabs_dir, f"{prefix}ekler.json")
        bpe_path = os.path.join(vocabs_dir, f"{prefix}bpe_tokenler.json")

        with open(kokler_path, "w", encoding="utf-8") as f:
            json.dump(self._tok.roots, f, ensure_ascii=False, indent=2)

        with open(ekler_path, "w", encoding="utf-8") as f:
            json.dump(self._tok.suffixes, f, ensure_ascii=False, indent=2)

        with open(bpe_path, "w", encoding="utf-8") as f:
            json.dump(self._tok.bpe_tokens, f, ensure_ascii=False, indent=2)

        return (kokler_path, ekler_path, bpe_path)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        # NOTE(review): pure pass-through to the base implementation;
        # kept, presumably, as an explicit extension point.
        return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
|
|