elshadrahimov committed on
Commit
cd11bc3
·
verified ·
1 Parent(s): 95d95f2

Update tokenization_milli.py

Browse files
Files changed (1) hide show
  1. tokenization_milli.py +28 -15
tokenization_milli.py CHANGED
@@ -4,6 +4,7 @@ import pickle
4
  from typing import List, Optional, Tuple
5
  from transformers import PreTrainedTokenizer
6
  from tokenizers import Tokenizer
 
7
 
8
  class MiLLiTokenizer(PreTrainedTokenizer):
9
  """miLLi 1.0: Azerbaijani Hybrid Tokenizer with Phonological Restoration"""
@@ -12,30 +13,42 @@ class MiLLiTokenizer(PreTrainedTokenizer):
12
 
13
  def __init__(
14
  self,
15
- trie_file=None,
16
- bpe_file=None,
 
17
  unk_token="<UNK>",
18
  pad_token="<PAD>",
19
  eos_token="<EOS>",
20
  **kwargs
21
  ):
22
- # Faylların olduğu qovluğu tapırıq (Hugging Face cache qovluğu)
23
- current_dir = os.path.dirname(__file__)
24
 
25
- # Fayl yollarını mütləq yola çeviririk
26
- resolved_trie_path = os.path.join(current_dir, "milli_trie_v1.pkl")
27
- resolved_bpe_path = os.path.join(current_dir, "milli_bpe_v1.json")
28
 
29
- # 1. Trie lüğətini yükləyirik
30
- if not os.path.exists(resolved_trie_path):
31
- raise FileNotFoundError(f"Trie faylı tapılmadı: {resolved_trie_path}")
32
- with open(resolved_trie_path, "rb") as f:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  self.trie = pickle.load(f)
34
 
35
- # 2. BPE modelini yükləyirik
36
- if not os.path.exists(resolved_bpe_path):
37
- raise FileNotFoundError(f"BPE faylı tapılmadı: {resolved_bpe_path}")
38
- self.bpe_backend = Tokenizer.from_file(resolved_bpe_path)
39
 
40
  super().__init__(
41
  unk_token=unk_token,
 
4
  from typing import List, Optional, Tuple
5
  from transformers import PreTrainedTokenizer
6
  from tokenizers import Tokenizer
7
+ from huggingface_hub import hf_hub_download
8
 
9
  class MiLLiTokenizer(PreTrainedTokenizer):
10
  """miLLi 1.0: Azerbaijani Hybrid Tokenizer with Phonological Restoration"""
 
13
 
14
  def __init__(
15
  self,
16
+ repo_id="elshadrahimov/miLLi-1.0", # Repozitoriyanızın adı bura yazılır
17
+ trie_file="milli_trie_v1.pkl",
18
+ bpe_file="milli_bpe_v1.json",
19
  unk_token="<UNK>",
20
  pad_token="<PAD>",
21
  eos_token="<EOS>",
22
  **kwargs
23
  ):
 
 
24
 
25
+ local_dir = os.path.dirname(__file__)
26
+ local_trie_path = os.path.join(local_dir, trie_file)
27
+ local_bpe_path = os.path.join(local_dir, bpe_file)
28
 
29
+ if os.path.exists(local_trie_path):
30
+ self.trie_path = local_trie_path
31
+ else:
32
+ try:
33
+ self.trie_path = hf_hub_download(repo_id=repo_id, filename=trie_file)
34
+ except Exception as e:
35
+ raise FileNotFoundError(f"Trie faylını Hub-dan yükləmək olmadı: {e}")
36
+
37
+ if os.path.exists(local_bpe_path):
38
+ self.bpe_path = local_bpe_path
39
+ else:
40
+
41
+ try:
42
+ self.bpe_path = hf_hub_download(repo_id=repo_id, filename=bpe_file)
43
+ except Exception as e:
44
+ raise FileNotFoundError(f"BPE faylını Hub-dan yükləmək olmadı: {e}")
45
+
46
+
47
+ with open(self.trie_path, "rb") as f:
48
  self.trie = pickle.load(f)
49
 
50
+
51
+ self.bpe_backend = Tokenizer.from_file(self.bpe_path)
 
 
52
 
53
  super().__init__(
54
  unk_token=unk_token,