nioushasadjadi
committed on
Commit
·
eb1e311
1
Parent(s):
f15abb2
Download the tokenizer from Hugging Face Hub
Browse files- tokenizer.py +3 -1
tokenizer.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
from transformers import PreTrainedTokenizer
|
|
|
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
from itertools import product
|
|
@@ -109,7 +110,8 @@ class KmerTokenizer(PreTrainedTokenizer):
|
|
| 109 |
@classmethod
|
| 110 |
def from_pretrained(cls, pretrained_dir, **kwargs):
|
| 111 |
# Load vocabulary
|
| 112 |
-
vocab_file =
|
|
|
|
| 113 |
with open(vocab_file, "r", encoding="utf-8") as f:
|
| 114 |
vocab_content = json.load(f)
|
| 115 |
vocab = vocab_content["model"]["vocab"]
|
|
|
|
| 1 |
from transformers import PreTrainedTokenizer
|
| 2 |
+
from huggingface_hub import hf_hub_download
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
from itertools import product
|
|
|
|
| 110 |
@classmethod
|
| 111 |
def from_pretrained(cls, pretrained_dir, **kwargs):
|
| 112 |
# Load vocabulary
|
| 113 |
+
vocab_file = hf_hub_download(repo_id=pretrained_dir, filename="tokenizer.json")
|
| 114 |
+
# vocab_file = os.path.join(pretrained_dir, "tokenizer.json")
|
| 115 |
with open(vocab_file, "r", encoding="utf-8") as f:
|
| 116 |
vocab_content = json.load(f)
|
| 117 |
vocab = vocab_content["model"]["vocab"]
|