saracandu committed on
Commit
55951b4
·
verified ·
1 Parent(s): 886c8fb

Upload tokenizer

Browse files
Files changed (1) hide show
  1. tokenizer.py +12 -3
tokenizer.py CHANGED
@@ -8,8 +8,17 @@ import json
8
 
9
  logger = logging.get_logger(__name__)
10
 
 
 
 
 
 
 
 
 
 
11
 
12
- def load_json(path: str) -> Union[Dict, List]:
13
  """
14
  Load a JSON file from the given path.
15
  Args:
@@ -30,7 +39,7 @@ class STLTokenizer(PreTrainedTokenizer):
30
  and handle padding and special tokens.
31
  """
32
 
33
- def __init__(self, vocab_path: str = 'vocab.json', unk_token: str = "unk", pad_token: str = "pad",
34
  bos_token: str = "/s", eos_token: str = "s", model_max_length = 512, **kwargs):
35
  """
36
  Initializes the STLTokenizer with a given vocabulary and special tokens.
@@ -41,7 +50,7 @@ class STLTokenizer(PreTrainedTokenizer):
41
  bos_token (str, optional): The token used for the beginning of a sequence. Defaults to "/s".
42
  eos_token (str, optional): The token used for the end of a sequence. Defaults to "s".
43
  """
44
- self.vocab = load_json(vocab_path)
45
  self.unk_token = unk_token
46
  self.pad_token = pad_token
47
  self.bos_token = bos_token
 
8
 
9
  logger = logging.get_logger(__name__)
10
 
11
+ from huggingface_hub import hf_hub_download
12
+ import json
13
+ import os
14
+
15
def load_json(path: str, repo_id=None) -> Union[Dict, List]:
    """
    Load a JSON file, optionally resolving it through the Hugging Face Hub.

    Args:
        path (str): Local path to the JSON file, or the filename inside the
            repository when ``repo_id`` is given.
        repo_id (str, optional): Hugging Face Hub repository id. When provided,
            the file is first downloaded (or fetched from the local cache) via
            ``hf_hub_download`` and read from the resulting local path.
            Defaults to None.

    Returns:
        Union[Dict, List]: The parsed JSON content.

    Raises:
        FileNotFoundError: If the local file does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    if repo_id:
        # Resolve the repo-relative filename to a locally cached copy.
        path = hf_hub_download(repo_id, path)
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
20
 
21
+ def load_json_old(path: str) -> Union[Dict, List]:
22
  """
23
  Load a JSON file from the given path.
24
  Args:
 
39
  and handle padding and special tokens.
40
  """
41
 
42
+ def __init__(self, vocab_path: str = "vocab.json", unk_token: str = "unk", pad_token: str = "pad",
43
  bos_token: str = "/s", eos_token: str = "s", model_max_length = 512, **kwargs):
44
  """
45
  Initializes the STLTokenizer with a given vocabulary and special tokens.
 
50
  bos_token (str, optional): The token used for the beginning of a sequence. Defaults to "/s".
51
  eos_token (str, optional): The token used for the end of a sequence. Defaults to "s".
52
  """
53
+ self.vocab = load_json("vocab.json", repo_id="saracandu/stldec_random_16")
54
  self.unk_token = unk_token
55
  self.pad_token = pad_token
56
  self.bos_token = bos_token