GenerTeam committed on
Commit
da5fa93
·
verified ·
1 Parent(s): 1d0509f

Update tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +4 -4
tokenizer.py CHANGED
@@ -53,10 +53,10 @@ class DNAKmerTokenizer(PreTrainedTokenizer):
53
  "|".join(re.escape(token) for token in self.special_tokens)
54
  )
55
  self.dna_pattern = re.compile(f"[A-Z]{{{self.k}}}|[A-Z]+")
56
- self.bos_token = "<s>"
57
- self.eos_token = "</s>"
58
- self.bos_token_id = self._convert_token_to_id(self.bos_token)
59
- self.eos_token_id = self._convert_token_to_id(self.eos_token)
60
  super().__init__(**kwargs)
61
 
62
  @property
 
53
  "|".join(re.escape(token) for token in self.special_tokens)
54
  )
55
  self.dna_pattern = re.compile(f"[A-Z]{{{self.k}}}|[A-Z]+")
56
+ kwargs.setdefault("bos_token", "<s>")
57
+ kwargs.setdefault("eos_token", "</s>")
58
+ kwargs.setdefault("unk_token", "<oov>")
59
+ kwargs.setdefault("pad_token", "<pad>")
60
  super().__init__(**kwargs)
61
 
62
  @property