saracandu
/

stldec_random_16_large

Text Generation

Model card Files Files and versions

saracandu committed on Aug 21, 2025

Commit

55951b4

·

verified ·

1 Parent(s): 886c8fb

Upload tokenizer

Files changed (1) hide show

tokenizer.py +12 -3

tokenizer.py CHANGED Viewed

@@ -8,8 +8,17 @@ import json
 logger = logging.get_logger(__name__)
-def load_json(path: str) -> Union[Dict, List]:
     """
     Load a JSON file from the given path.
     Args:
@@ -30,7 +39,7 @@ class STLTokenizer(PreTrainedTokenizer):
     and handle padding and special tokens.
     """
-    def __init__(self, vocab_path: str = 'vocab.json', unk_token: str = "unk", pad_token: str = "pad",
                  bos_token: str = "/s", eos_token: str = "s", model_max_length = 512, **kwargs):
         """
         Initializes the STLTokenizer with a given vocabulary and special tokens.
@@ -41,7 +50,7 @@ class STLTokenizer(PreTrainedTokenizer):
             bos_token (str, optional): The token used for the beginning of a sequence. Defaults to "/s".
             eos_token (str, optional): The token used for the end of a sequence. Defaults to "s".
         """
-        self.vocab = load_json(vocab_path)
         self.unk_token = unk_token
         self.pad_token = pad_token
         self.bos_token = bos_token

 logger = logging.get_logger(__name__)
+from huggingface_hub import hf_hub_download
+import json
+import os
+def load_json(path, repo_id=None):
+    if repo_id:
+        path = hf_hub_download(repo_id, path)
+    with open(path, "r", encoding="utf-8") as f:
+        return json.load(f)
+def load_json_old(path: str) -> Union[Dict, List]:
     """
     Load a JSON file from the given path.
     Args:
     and handle padding and special tokens.
     """
+    def __init__(self, vocab_path: str = "vocab.json", unk_token: str = "unk", pad_token: str = "pad",
                  bos_token: str = "/s", eos_token: str = "s", model_max_length = 512, **kwargs):
         """
         Initializes the STLTokenizer with a given vocabulary and special tokens.
             bos_token (str, optional): The token used for the beginning of a sequence. Defaults to "/s".
             eos_token (str, optional): The token used for the end of a sequence. Defaults to "s".
         """
+        self.vocab = load_json("vocab.json", repo_id="saracandu/stldec_random_16")
         self.unk_token = unk_token
         self.pad_token = pad_token
         self.bos_token = bos_token