alibayram committed on
Commit
634f92b
·
1 Parent(s): 5b0462c

Enhance the save_vocabulary method to create a "vocabs" subdirectory and save the three vocabulary files (kokler.json, ekler.json, bpe_tokenler.json) there as JSON

Browse files
Files changed (1) hide show
  1. tokenization_turkish_mft.py +22 -7
tokenization_turkish_mft.py CHANGED
@@ -865,13 +865,28 @@ class TurkishMFTTokenizerHF(PreTrainedTokenizer):
865
 
866
  def save_vocabulary(
867
  self, save_directory: str, filename_prefix: Optional[str] = None
868
- ):
869
- # This tokenizer lives with JSON resources.
870
- # Simple dump vocab, but not strictly required as resources are loaded from files.
871
- path = os.path.join(save_directory, (filename_prefix or "") + "vocab.json")
872
- with open(path, "w", encoding="utf-8") as f:
873
- json.dump(self.get_vocab(), f, ensure_ascii=False)
874
- return (path,)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
875
 
876
  @classmethod
877
  def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
 
865
 
866
  def save_vocabulary(
867
  self, save_directory: str, filename_prefix: Optional[str] = None
868
+ ) -> Tuple[str, ...]:
869
+ # Create vocabs subdirectory
870
+ vocabs_dir = os.path.join(save_directory, "vocabs")
871
+ os.makedirs(vocabs_dir, exist_ok=True)
872
+
873
+ prefix = filename_prefix or ""
874
+
875
+ # Save all three vocabulary files
876
+ kokler_path = os.path.join(vocabs_dir, f"{prefix}kokler.json")
877
+ ekler_path = os.path.join(vocabs_dir, f"{prefix}ekler.json")
878
+ bpe_path = os.path.join(vocabs_dir, f"{prefix}bpe_tokenler.json")
879
+
880
+ with open(kokler_path, "w", encoding="utf-8") as f:
881
+ json.dump(self._tok.roots, f, ensure_ascii=False, indent=2)
882
+
883
+ with open(ekler_path, "w", encoding="utf-8") as f:
884
+ json.dump(self._tok.suffixes, f, ensure_ascii=False, indent=2)
885
+
886
+ with open(bpe_path, "w", encoding="utf-8") as f:
887
+ json.dump(self._tok.bpe_tokens, f, ensure_ascii=False, indent=2)
888
+
889
+ return (kokler_path, ekler_path, bpe_path)
890
 
891
  @classmethod
892
  def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):