elshadrahimov committed on
Commit
a9b0c69
·
verified ·
1 Parent(s): 8ebed99

Update tokenization_milli.py

Browse files
Files changed (1) hide show
  1. tokenization_milli.py +15 -6
tokenization_milli.py CHANGED
@@ -6,6 +6,14 @@ from transformers import PreTrainedTokenizer
6
  from tokenizers import Tokenizer
7
  from huggingface_hub import hf_hub_download
8
 
 
 
 
 
 
 
 
 
9
  class MiLLiTokenizer(PreTrainedTokenizer):
10
  """miLLi 1.0: Azerbaijani Hybrid Tokenizer with Phonological Restoration"""
11
 
@@ -21,7 +29,6 @@ class MiLLiTokenizer(PreTrainedTokenizer):
21
  eos_token="<EOS>",
22
  **kwargs
23
  ):
24
-
25
  local_dir = os.path.dirname(__file__)
26
  local_trie_path = os.path.join(local_dir, trie_file)
27
  local_bpe_path = os.path.join(local_dir, bpe_file)
@@ -32,22 +39,19 @@ class MiLLiTokenizer(PreTrainedTokenizer):
32
  try:
33
  self.trie_path = hf_hub_download(repo_id=repo_id, filename=trie_file)
34
  except Exception as e:
35
- raise FileNotFoundError(f"Trie faylını Hub-dan yükləmək olmadı: {e}")
36
 
37
  if os.path.exists(local_bpe_path):
38
  self.bpe_path = local_bpe_path
39
  else:
40
-
41
  try:
42
  self.bpe_path = hf_hub_download(repo_id=repo_id, filename=bpe_file)
43
  except Exception as e:
44
- raise FileNotFoundError(f"BPE faylını Hub-dan yükləmək olmadı: {e}")
45
 
46
-
47
  with open(self.trie_path, "rb") as f:
48
  self.trie = pickle.load(f)
49
 
50
-
51
  self.bpe_backend = Tokenizer.from_file(self.bpe_path)
52
 
53
  super().__init__(
@@ -67,11 +71,13 @@ class MiLLiTokenizer(PreTrainedTokenizer):
67
  def _normalize_phonology(self, sub_word: str) -> Optional[str]:
68
  if "_" + sub_word in self.trie:
69
  return sub_word
 
70
  replacements = {'ğ': 'q', 'y': 'k'}
71
  if sub_word and sub_word[-1] in replacements:
72
  restored = sub_word[:-1] + replacements[sub_word[-1]]
73
  if "_" + restored in self.trie:
74
  return restored
 
75
  vowels = ['ı', 'i', 'u', 'ü']
76
  if len(sub_word) >= 2:
77
  for v in vowels:
@@ -86,14 +92,17 @@ class MiLLiTokenizer(PreTrainedTokenizer):
86
  for token in raw_tokens:
87
  if not re.match(r'\w+', token):
88
  processed_parts.append(token); continue
 
89
  is_upper = token[0].isupper()
90
  word = token.lower()
91
  if is_upper: processed_parts.append("<UPPER>")
 
92
  found_root, best_root_len = None, 0
93
  for i in range(len(word), 0, -1):
94
  root = self._normalize_phonology(word[:i])
95
  if root:
96
  found_root = root; best_root_len = i; break
 
97
  if found_root:
98
  processed_parts.append("_" + found_root)
99
  remainder = word[best_root_len:]
 
6
  from tokenizers import Tokenizer
7
  from huggingface_hub import hf_hub_download
8
 
9
+ try:
10
+ import ahocorasick
11
+ except ImportError:
12
+ raise ImportError(
13
+ "This tokenizer requires the 'pyahocorasick' library. "
14
+ "Please install it using: pip install pyahocorasick"
15
+ )
16
+
17
  class MiLLiTokenizer(PreTrainedTokenizer):
18
  """miLLi 1.0: Azerbaijani Hybrid Tokenizer with Phonological Restoration"""
19
 
 
29
  eos_token="<EOS>",
30
  **kwargs
31
  ):
 
32
  local_dir = os.path.dirname(__file__)
33
  local_trie_path = os.path.join(local_dir, trie_file)
34
  local_bpe_path = os.path.join(local_dir, bpe_file)
 
39
  try:
40
  self.trie_path = hf_hub_download(repo_id=repo_id, filename=trie_file)
41
  except Exception as e:
42
+ raise FileNotFoundError(f"Could not download Trie file from Hub: {e}")
43
 
44
  if os.path.exists(local_bpe_path):
45
  self.bpe_path = local_bpe_path
46
  else:
 
47
  try:
48
  self.bpe_path = hf_hub_download(repo_id=repo_id, filename=bpe_file)
49
  except Exception as e:
50
+ raise FileNotFoundError(f"Could not download BPE file from Hub: {e}")
51
 
 
52
  with open(self.trie_path, "rb") as f:
53
  self.trie = pickle.load(f)
54
 
 
55
  self.bpe_backend = Tokenizer.from_file(self.bpe_path)
56
 
57
  super().__init__(
 
71
  def _normalize_phonology(self, sub_word: str) -> Optional[str]:
72
  if "_" + sub_word in self.trie:
73
  return sub_word
74
+
75
  replacements = {'ğ': 'q', 'y': 'k'}
76
  if sub_word and sub_word[-1] in replacements:
77
  restored = sub_word[:-1] + replacements[sub_word[-1]]
78
  if "_" + restored in self.trie:
79
  return restored
80
+
81
  vowels = ['ı', 'i', 'u', 'ü']
82
  if len(sub_word) >= 2:
83
  for v in vowels:
 
92
  for token in raw_tokens:
93
  if not re.match(r'\w+', token):
94
  processed_parts.append(token); continue
95
+
96
  is_upper = token[0].isupper()
97
  word = token.lower()
98
  if is_upper: processed_parts.append("<UPPER>")
99
+
100
  found_root, best_root_len = None, 0
101
  for i in range(len(word), 0, -1):
102
  root = self._normalize_phonology(word[:i])
103
  if root:
104
  found_root = root; best_root_len = i; break
105
+
106
  if found_root:
107
  processed_parts.append("_" + found_root)
108
  remainder = word[best_root_len:]