GenerTeam committed on
Commit
bce240d
·
verified ·
1 Parent(s): 3cf3a95

Update tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +5 -6
tokenizer.py CHANGED
@@ -53,12 +53,11 @@ class DNAKmerTokenizer(PreTrainedTokenizer):
53
  "|".join(re.escape(token) for token in self.special_tokens)
54
  )
55
  self.dna_pattern = re.compile(f"[A-Z]{{{self.k}}}|[A-Z]+")
56
- self.bos_token = "<s>"
57
- self.eos_token = "</s>"
58
- self.mask_token = "<mask>"
59
- self.bos_token_id = self._convert_token_to_id(self.bos_token)
60
- self.eos_token_id = self._convert_token_to_id(self.eos_token)
61
- self.mask_token_id = self._convert_token_to_id(self.mask_token)
62
  super().__init__(**kwargs)
63
 
64
  @property
 
53
  "|".join(re.escape(token) for token in self.special_tokens)
54
  )
55
  self.dna_pattern = re.compile(f"[A-Z]{{{self.k}}}|[A-Z]+")
56
+ kwargs.setdefault("bos_token", "<s>")
57
+ kwargs.setdefault("eos_token", "</s>")
58
+ kwargs.setdefault("mask_token", "<mask>")
59
+ kwargs.setdefault("unk_token", "<oov>")
60
+ kwargs.setdefault("pad_token", "<pad>")
 
61
  super().__init__(**kwargs)
62
 
63
  @property