Update tokenization_milli.py
Browse files- tokenization_milli.py +15 -6
tokenization_milli.py
CHANGED
|
@@ -6,6 +6,14 @@ from transformers import PreTrainedTokenizer
|
|
| 6 |
from tokenizers import Tokenizer
|
| 7 |
from huggingface_hub import hf_hub_download
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
class MiLLiTokenizer(PreTrainedTokenizer):
|
| 10 |
"""miLLi 1.0: Azerbaijani Hybrid Tokenizer with Phonological Restoration"""
|
| 11 |
|
|
@@ -21,7 +29,6 @@ class MiLLiTokenizer(PreTrainedTokenizer):
|
|
| 21 |
eos_token="<EOS>",
|
| 22 |
**kwargs
|
| 23 |
):
|
| 24 |
-
|
| 25 |
local_dir = os.path.dirname(__file__)
|
| 26 |
local_trie_path = os.path.join(local_dir, trie_file)
|
| 27 |
local_bpe_path = os.path.join(local_dir, bpe_file)
|
|
@@ -32,22 +39,19 @@ class MiLLiTokenizer(PreTrainedTokenizer):
|
|
| 32 |
try:
|
| 33 |
self.trie_path = hf_hub_download(repo_id=repo_id, filename=trie_file)
|
| 34 |
except Exception as e:
|
| 35 |
-
raise FileNotFoundError(f"
|
| 36 |
|
| 37 |
if os.path.exists(local_bpe_path):
|
| 38 |
self.bpe_path = local_bpe_path
|
| 39 |
else:
|
| 40 |
-
|
| 41 |
try:
|
| 42 |
self.bpe_path = hf_hub_download(repo_id=repo_id, filename=bpe_file)
|
| 43 |
except Exception as e:
|
| 44 |
-
raise FileNotFoundError(f"
|
| 45 |
|
| 46 |
-
|
| 47 |
with open(self.trie_path, "rb") as f:
|
| 48 |
self.trie = pickle.load(f)
|
| 49 |
|
| 50 |
-
|
| 51 |
self.bpe_backend = Tokenizer.from_file(self.bpe_path)
|
| 52 |
|
| 53 |
super().__init__(
|
|
@@ -67,11 +71,13 @@ class MiLLiTokenizer(PreTrainedTokenizer):
|
|
| 67 |
def _normalize_phonology(self, sub_word: str) -> Optional[str]:
|
| 68 |
if "_" + sub_word in self.trie:
|
| 69 |
return sub_word
|
|
|
|
| 70 |
replacements = {'ğ': 'q', 'y': 'k'}
|
| 71 |
if sub_word and sub_word[-1] in replacements:
|
| 72 |
restored = sub_word[:-1] + replacements[sub_word[-1]]
|
| 73 |
if "_" + restored in self.trie:
|
| 74 |
return restored
|
|
|
|
| 75 |
vowels = ['ı', 'i', 'u', 'ü']
|
| 76 |
if len(sub_word) >= 2:
|
| 77 |
for v in vowels:
|
|
@@ -86,14 +92,17 @@ class MiLLiTokenizer(PreTrainedTokenizer):
|
|
| 86 |
for token in raw_tokens:
|
| 87 |
if not re.match(r'\w+', token):
|
| 88 |
processed_parts.append(token); continue
|
|
|
|
| 89 |
is_upper = token[0].isupper()
|
| 90 |
word = token.lower()
|
| 91 |
if is_upper: processed_parts.append("<UPPER>")
|
|
|
|
| 92 |
found_root, best_root_len = None, 0
|
| 93 |
for i in range(len(word), 0, -1):
|
| 94 |
root = self._normalize_phonology(word[:i])
|
| 95 |
if root:
|
| 96 |
found_root = root; best_root_len = i; break
|
|
|
|
| 97 |
if found_root:
|
| 98 |
processed_parts.append("_" + found_root)
|
| 99 |
remainder = word[best_root_len:]
|
|
|
|
| 6 |
from tokenizers import Tokenizer
|
| 7 |
from huggingface_hub import hf_hub_download
|
| 8 |
|
| 9 |
+
# Fail fast at import time with an actionable message when the dependency
# is missing, instead of surfacing an obscure error later.
# NOTE(review): `ahocorasick` is not referenced by name in the visible code;
# presumably it must be importable so the pickled trie can be loaded — confirm.
try:
    import ahocorasick
except ImportError as exc:
    # Chain explicitly so the traceback shows the original failure as the
    # direct cause rather than as an error raised while handling another.
    raise ImportError(
        "This tokenizer requires the 'pyahocorasick' library. "
        "Please install it using: pip install pyahocorasick"
    ) from exc
|
| 16 |
+
|
| 17 |
class MiLLiTokenizer(PreTrainedTokenizer):
|
| 18 |
"""miLLi 1.0: Azerbaijani Hybrid Tokenizer with Phonological Restoration"""
|
| 19 |
|
|
|
|
| 29 |
eos_token="<EOS>",
|
| 30 |
**kwargs
|
| 31 |
):
|
|
|
|
| 32 |
local_dir = os.path.dirname(__file__)
|
| 33 |
local_trie_path = os.path.join(local_dir, trie_file)
|
| 34 |
local_bpe_path = os.path.join(local_dir, bpe_file)
|
|
|
|
| 39 |
try:
|
| 40 |
self.trie_path = hf_hub_download(repo_id=repo_id, filename=trie_file)
|
| 41 |
except Exception as e:
|
| 42 |
+
raise FileNotFoundError(f"Could not download Trie file from Hub: {e}")
|
| 43 |
|
| 44 |
if os.path.exists(local_bpe_path):
|
| 45 |
self.bpe_path = local_bpe_path
|
| 46 |
else:
|
|
|
|
| 47 |
try:
|
| 48 |
self.bpe_path = hf_hub_download(repo_id=repo_id, filename=bpe_file)
|
| 49 |
except Exception as e:
|
| 50 |
+
raise FileNotFoundError(f"Could not download BPE file from Hub: {e}")
|
| 51 |
|
|
|
|
| 52 |
with open(self.trie_path, "rb") as f:
|
| 53 |
self.trie = pickle.load(f)
|
| 54 |
|
|
|
|
| 55 |
self.bpe_backend = Tokenizer.from_file(self.bpe_path)
|
| 56 |
|
| 57 |
super().__init__(
|
|
|
|
| 71 |
def _normalize_phonology(self, sub_word: str) -> Optional[str]:
|
| 72 |
if "_" + sub_word in self.trie:
|
| 73 |
return sub_word
|
| 74 |
+
|
| 75 |
replacements = {'ğ': 'q', 'y': 'k'}
|
| 76 |
if sub_word and sub_word[-1] in replacements:
|
| 77 |
restored = sub_word[:-1] + replacements[sub_word[-1]]
|
| 78 |
if "_" + restored in self.trie:
|
| 79 |
return restored
|
| 80 |
+
|
| 81 |
vowels = ['ı', 'i', 'u', 'ü']
|
| 82 |
if len(sub_word) >= 2:
|
| 83 |
for v in vowels:
|
|
|
|
| 92 |
for token in raw_tokens:
|
| 93 |
if not re.match(r'\w+', token):
|
| 94 |
processed_parts.append(token); continue
|
| 95 |
+
|
| 96 |
is_upper = token[0].isupper()
|
| 97 |
word = token.lower()
|
| 98 |
if is_upper: processed_parts.append("<UPPER>")
|
| 99 |
+
|
| 100 |
found_root, best_root_len = None, 0
|
| 101 |
for i in range(len(word), 0, -1):
|
| 102 |
root = self._normalize_phonology(word[:i])
|
| 103 |
if root:
|
| 104 |
found_root = root; best_root_len = i; break
|
| 105 |
+
|
| 106 |
if found_root:
|
| 107 |
processed_parts.append("_" + found_root)
|
| 108 |
remainder = word[best_root_len:]
|