Enhance save_vocabulary method to create a subdirectory for vocab files and save multiple vocabulary formats
Browse files — tokenization_turkish_mft.py (+22 −7)
tokenization_turkish_mft.py
CHANGED
|
@@ -865,13 +865,28 @@ class TurkishMFTTokenizerHF(PreTrainedTokenizer):
|
|
| 865 |
|
| 866 |
def save_vocabulary(
|
| 867 |
self, save_directory: str, filename_prefix: Optional[str] = None
|
| 868 |
-
):
|
| 869 |
-
#
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 875 |
|
| 876 |
@classmethod
|
| 877 |
def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
|
|
|
|
| 865 |
|
| 866 |
def save_vocabulary(
|
| 867 |
self, save_directory: str, filename_prefix: Optional[str] = None
|
| 868 |
+
) -> Tuple[str, ...]:
|
| 869 |
+
# Create vocabs subdirectory
|
| 870 |
+
vocabs_dir = os.path.join(save_directory, "vocabs")
|
| 871 |
+
os.makedirs(vocabs_dir, exist_ok=True)
|
| 872 |
+
|
| 873 |
+
prefix = filename_prefix or ""
|
| 874 |
+
|
| 875 |
+
# Save all three vocabulary files
|
| 876 |
+
kokler_path = os.path.join(vocabs_dir, f"{prefix}kokler.json")
|
| 877 |
+
ekler_path = os.path.join(vocabs_dir, f"{prefix}ekler.json")
|
| 878 |
+
bpe_path = os.path.join(vocabs_dir, f"{prefix}bpe_tokenler.json")
|
| 879 |
+
|
| 880 |
+
with open(kokler_path, "w", encoding="utf-8") as f:
|
| 881 |
+
json.dump(self._tok.roots, f, ensure_ascii=False, indent=2)
|
| 882 |
+
|
| 883 |
+
with open(ekler_path, "w", encoding="utf-8") as f:
|
| 884 |
+
json.dump(self._tok.suffixes, f, ensure_ascii=False, indent=2)
|
| 885 |
+
|
| 886 |
+
with open(bpe_path, "w", encoding="utf-8") as f:
|
| 887 |
+
json.dump(self._tok.bpe_tokens, f, ensure_ascii=False, indent=2)
|
| 888 |
+
|
| 889 |
+
return (kokler_path, ekler_path, bpe_path)
|
| 890 |
|
| 891 |
@classmethod
|
| 892 |
def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
|