mohsennp commited on
Commit
89ce8d5
·
verified ·
1 Parent(s): 59a04c6

Upload tokenizer

Browse files
Files changed (1) hide show
  1. tokenization_decodon.py +31 -3
tokenization_decodon.py CHANGED
@@ -27,6 +27,30 @@ class DeCodonTokenizer(PreTrainedTokenizer):
27
  else:
28
  return ["".join(codon) for codon in product("ACGU", repeat=3)]
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def __init__(
31
  self,
32
  vocab_file=None,
@@ -49,13 +73,17 @@ class DeCodonTokenizer(PreTrainedTokenizer):
49
  with open(vocab_file, "r") as f:
50
  self.encoder = json.load(f)
51
  self.decoder = {i: k for k, i in self.encoder.items()}
 
 
 
 
52
  else:
53
  self.encoder = {k: i for i, k in enumerate(self.special_tokens + self.codons)}
54
  self.decoder = {i: k for k, i in self.encoder.items()}
55
 
56
- self.compiled_regex = re.compile(
57
- "|".join(self.codons + self.special_tokens + [r"\S"])
58
- )
59
 
60
  super().__init__(
61
  cls_token=cls_token,
 
27
  else:
28
  return ["".join(codon) for codon in product("ACGU", repeat=3)]
29
 
30
+ @classmethod
31
+ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
32
+ """
33
+ Instantiate a DeCodonTokenizer from a pre-trained tokenizer.
34
+ """
35
+ # Handle the case where we're loading from a local directory
36
+ if os.path.isdir(pretrained_model_name_or_path):
37
+ vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
38
+ if os.path.exists(vocab_file):
39
+ kwargs["vocab_file"] = vocab_file
40
+ else:
41
+ # For hub loading, try to get the vocab file from the cached download
42
+ from transformers.utils import cached_file
43
+ try:
44
+ vocab_file = cached_file(pretrained_model_name_or_path, "vocab.json")
45
+ if vocab_file:
46
+ kwargs["vocab_file"] = vocab_file
47
+ except Exception:
48
+ # If vocab.json is not found, continue without it (use default vocab)
49
+ pass
50
+
51
+ # Create instance with the vocab_file parameter
52
+ return cls(*inputs, **kwargs)
53
+
54
  def __init__(
55
  self,
56
  vocab_file=None,
 
73
  with open(vocab_file, "r") as f:
74
  self.encoder = json.load(f)
75
  self.decoder = {i: k for k, i in self.encoder.items()}
76
+
77
+ self.compiled_regex = re.compile(
78
+ "|".join(list(self.encoder.keys()) + [r"\S"])
79
+ )
80
  else:
81
  self.encoder = {k: i for i, k in enumerate(self.special_tokens + self.codons)}
82
  self.decoder = {i: k for k, i in self.encoder.items()}
83
 
84
+ self.compiled_regex = re.compile(
85
+ "|".join(self.codons + self.special_tokens + [r"\S"])
86
+ )
87
 
88
  super().__init__(
89
  cls_token=cls_token,