elshadrahimov
/

miLLi-1.0

@@ -6,6 +6,14 @@ from transformers import PreTrainedTokenizer
 from tokenizers import Tokenizer
 from huggingface_hub import hf_hub_download
 class MiLLiTokenizer(PreTrainedTokenizer):
     """miLLi 1.0: Azerbaijani Hybrid Tokenizer with Phonological Restoration"""
@@ -21,7 +29,6 @@ class MiLLiTokenizer(PreTrainedTokenizer):
         eos_token="<EOS>",
         **kwargs
     ):
         local_dir = os.path.dirname(__file__)
         local_trie_path = os.path.join(local_dir, trie_file)
         local_bpe_path = os.path.join(local_dir, bpe_file)
@@ -32,22 +39,19 @@ class MiLLiTokenizer(PreTrainedTokenizer):
             try:
                 self.trie_path = hf_hub_download(repo_id=repo_id, filename=trie_file)
             except Exception as e:
-                raise FileNotFoundError(f"Trie faylını Hub-dan yükləmək olmadı: {e}")
         if os.path.exists(local_bpe_path):
             self.bpe_path = local_bpe_path
         else:
             try:
                 self.bpe_path = hf_hub_download(repo_id=repo_id, filename=bpe_file)
             except Exception as e:
-                raise FileNotFoundError(f"BPE faylını Hub-dan yükləmək olmadı: {e}")
         with open(self.trie_path, "rb") as f:
             self.trie = pickle.load(f)
         self.bpe_backend = Tokenizer.from_file(self.bpe_path)
         super().__init__(
@@ -67,11 +71,13 @@ class MiLLiTokenizer(PreTrainedTokenizer):
     def _normalize_phonology(self, sub_word: str) -> Optional[str]:
         if "_" + sub_word in self.trie:
             return sub_word
         replacements = {'ğ': 'q', 'y': 'k'}
         if sub_word and sub_word[-1] in replacements:
             restored = sub_word[:-1] + replacements[sub_word[-1]]
             if "_" + restored in self.trie:
                 return restored
         vowels = ['ı', 'i', 'u', 'ü']
         if len(sub_word) >= 2:
             for v in vowels:
@@ -86,14 +92,17 @@ class MiLLiTokenizer(PreTrainedTokenizer):
         for token in raw_tokens:
             if not re.match(r'\w+', token):
                 processed_parts.append(token); continue
             is_upper = token[0].isupper()
             word = token.lower()
             if is_upper: processed_parts.append("<UPPER>")
             found_root, best_root_len = None, 0
             for i in range(len(word), 0, -1):
                 root = self._normalize_phonology(word[:i])
                 if root:
                     found_root = root; best_root_len = i; break
             if found_root:
                 processed_parts.append("_" + found_root)
                 remainder = word[best_root_len:]

 from tokenizers import Tokenizer
 from huggingface_hub import hf_hub_download
+try:
+    import ahocorasick
+except ImportError:
+    raise ImportError(
+        "This tokenizer requires the 'pyahocorasick' library. "
+        "Please install it using: pip install pyahocorasick"
+    )
 class MiLLiTokenizer(PreTrainedTokenizer):
     """miLLi 1.0: Azerbaijani Hybrid Tokenizer with Phonological Restoration"""
         eos_token="<EOS>",
         **kwargs
     ):
         local_dir = os.path.dirname(__file__)
         local_trie_path = os.path.join(local_dir, trie_file)
         local_bpe_path = os.path.join(local_dir, bpe_file)
             try:
                 self.trie_path = hf_hub_download(repo_id=repo_id, filename=trie_file)
             except Exception as e:
+                raise FileNotFoundError(f"Could not download Trie file from Hub: {e}")
         if os.path.exists(local_bpe_path):
             self.bpe_path = local_bpe_path
         else:
             try:
                 self.bpe_path = hf_hub_download(repo_id=repo_id, filename=bpe_file)
             except Exception as e:
+                raise FileNotFoundError(f"Could not download BPE file from Hub: {e}")
         with open(self.trie_path, "rb") as f:
             self.trie = pickle.load(f)
         self.bpe_backend = Tokenizer.from_file(self.bpe_path)
         super().__init__(
     def _normalize_phonology(self, sub_word: str) -> Optional[str]:
         if "_" + sub_word in self.trie:
             return sub_word
         replacements = {'ğ': 'q', 'y': 'k'}
         if sub_word and sub_word[-1] in replacements:
             restored = sub_word[:-1] + replacements[sub_word[-1]]
             if "_" + restored in self.trie:
                 return restored
         vowels = ['ı', 'i', 'u', 'ü']
         if len(sub_word) >= 2:
             for v in vowels:
         for token in raw_tokens:
             if not re.match(r'\w+', token):
                 processed_parts.append(token); continue
             is_upper = token[0].isupper()
             word = token.lower()
             if is_upper: processed_parts.append("<UPPER>")
             found_root, best_root_len = None, 0
             for i in range(len(word), 0, -1):
                 root = self._normalize_phonology(word[:i])
                 if root:
                     found_root = root; best_root_len = i; break
             if found_root:
                 processed_parts.append("_" + found_root)
                 remainder = word[best_root_len:]