Enhance save_vocabulary method to create a subdirectory for vocab files and save multiple vocabulary formats
Browse files — tokenization_turkish_mft.py (+22 −7)
tokenization_turkish_mft.py
CHANGED
|
@@ -865,13 +865,28 @@ class TurkishMFTTokenizerHF(PreTrainedTokenizer):
|
|
| 865 |
|
| 866 |
def save_vocabulary(
|
| 867 |
self, save_directory: str, filename_prefix: Optional[str] = None
|
| 868 |
-
):
|
| 869 |
-
#
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 875 |
|
| 876 |
@classmethod
|
| 877 |
def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
|
|
|
|
| 865 |
|
| 866 |
def save_vocabulary(
|
| 867 |
self, save_directory: str, filename_prefix: Optional[str] = None
|
| 868 |
+
) -> Tuple[str, ...]:
|
| 869 |
+
# Create vocabs subdirectory
|
| 870 |
+
vocabs_dir = os.path.join(save_directory, "vocabs")
|
| 871 |
+
os.makedirs(vocabs_dir, exist_ok=True)
|
| 872 |
+
|
| 873 |
+
prefix = filename_prefix or ""
|
| 874 |
+
|
| 875 |
+
# Save all three vocabulary files
|
| 876 |
+
kokler_path = os.path.join(vocabs_dir, f"{prefix}kokler.json")
|
| 877 |
+
ekler_path = os.path.join(vocabs_dir, f"{prefix}ekler.json")
|
| 878 |
+
bpe_path = os.path.join(vocabs_dir, f"{prefix}bpe_tokenler.json")
|
| 879 |
+
|
| 880 |
+
with open(kokler_path, "w", encoding="utf-8") as f:
|
| 881 |
+
json.dump(self._tok.roots, f, ensure_ascii=False, indent=2)
|
| 882 |
+
|
| 883 |
+
with open(ekler_path, "w", encoding="utf-8") as f:
|
| 884 |
+
json.dump(self._tok.suffixes, f, ensure_ascii=False, indent=2)
|
| 885 |
+
|
| 886 |
+
with open(bpe_path, "w", encoding="utf-8") as f:
|
| 887 |
+
json.dump(self._tok.bpe_tokens, f, ensure_ascii=False, indent=2)
|
| 888 |
+
|
| 889 |
+
return (kokler_path, ekler_path, bpe_path)
|
| 890 |
|
| 891 |
@classmethod
|
| 892 |
def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
|