nioushasadjadi
committed on
Commit
·
bcd9e56
1
Parent(s):
82681b6
Fixing tokenizer return type
Browse files - tokenizer.py +2 -2
tokenizer.py
CHANGED
|
@@ -26,14 +26,14 @@ class KmerTokenizer(PreTrainedTokenizer):
|
|
| 26 |
self.unk_token = "[UNK]"
|
| 27 |
# self.pad_token = "[PAD]"
|
| 28 |
|
| 29 |
-
def
|
| 30 |
splits = [text[i:i + self.k] for i in range(0, len(text) - self.k + 1, self.stride)]
|
| 31 |
if kwargs.get('return_tensors') == 'pt':
|
| 32 |
return torch.tensor(splits)
|
| 33 |
return splits
|
| 34 |
|
| 35 |
def _encode(self, text, **kwargs):
|
| 36 |
-
tokens = self.
|
| 37 |
token_ids = self.convert_tokens_to_ids(tokens)
|
| 38 |
if kwargs.get('return_tensors') == 'pt':
|
| 39 |
return torch.tensor(token_ids)
|
|
|
|
| 26 |
self.unk_token = "[UNK]"
|
| 27 |
# self.pad_token = "[PAD]"
|
| 28 |
|
| 29 |
+
def tokenize(self, text, **kwargs):
|
| 30 |
splits = [text[i:i + self.k] for i in range(0, len(text) - self.k + 1, self.stride)]
|
| 31 |
if kwargs.get('return_tensors') == 'pt':
|
| 32 |
return torch.tensor(splits)
|
| 33 |
return splits
|
| 34 |
|
| 35 |
def _encode(self, text, **kwargs):
|
| 36 |
+
tokens = self.tokenize(text, **kwargs)
|
| 37 |
token_ids = self.convert_tokens_to_ids(tokens)
|
| 38 |
if kwargs.get('return_tensors') == 'pt':
|
| 39 |
return torch.tensor(token_ids)
|