Upload tokenizer
Browse files- tokenization_decodon.py +31 -3
tokenization_decodon.py
CHANGED
|
@@ -27,6 +27,30 @@ class DeCodonTokenizer(PreTrainedTokenizer):
|
|
| 27 |
else:
|
| 28 |
return ["".join(codon) for codon in product("ACGU", repeat=3)]
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
def __init__(
|
| 31 |
self,
|
| 32 |
vocab_file=None,
|
|
@@ -49,13 +73,17 @@ class DeCodonTokenizer(PreTrainedTokenizer):
|
|
| 49 |
with open(vocab_file, "r") as f:
|
| 50 |
self.encoder = json.load(f)
|
| 51 |
self.decoder = {i: k for k, i in self.encoder.items()}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
else:
|
| 53 |
self.encoder = {k: i for i, k in enumerate(self.special_tokens + self.codons)}
|
| 54 |
self.decoder = {i: k for k, i in self.encoder.items()}
|
| 55 |
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
|
| 60 |
super().__init__(
|
| 61 |
cls_token=cls_token,
|
|
|
|
| 27 |
else:
|
| 28 |
return ["".join(codon) for codon in product("ACGU", repeat=3)]
|
| 29 |
|
| 30 |
+
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
    """Instantiate a DeCodonTokenizer from a local directory or a Hub repo.

    Looks for a ``vocab.json`` next to the tokenizer. When one is found, its
    path is passed to the constructor as ``vocab_file``; otherwise the
    constructor is called without it.

    Args:
        pretrained_model_name_or_path: Path to a local directory, or an
            identifier that ``transformers.utils.cached_file`` can resolve.
        *inputs: Positional arguments forwarded unchanged to ``cls``.
        **kwargs: Keyword arguments forwarded to ``cls``. Download options
            (``cache_dir``, ``force_download``, ``proxies``, ``revision``,
            ``local_files_only``, ``subfolder``) are additionally passed to
            ``cached_file`` so the vocabulary is resolved from the same
            revision/cache the caller asked for.

    Returns:
        A new instance of ``cls``.
    """
    import logging

    if os.path.isdir(pretrained_model_name_or_path):
        # Local directory: use its vocab.json if present.
        vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        if os.path.exists(vocab_file):
            kwargs["vocab_file"] = vocab_file
    else:
        # Not a local directory: resolve vocab.json through the HF cache.
        from transformers.utils import cached_file

        # Only forward options the caller actually supplied; they are left
        # in ``kwargs`` so the constructor still receives them as before.
        hub_option_names = (
            "cache_dir",
            "force_download",
            "proxies",
            "revision",
            "local_files_only",
            "subfolder",
        )
        hub_kwargs = {
            name: kwargs[name] for name in hub_option_names if name in kwargs
        }

        try:
            vocab_file = cached_file(
                pretrained_model_name_or_path, "vocab.json", **hub_kwargs
            )
        except OSError as err:
            # cached_file reports missing files/repos as OSError. Falling
            # back to the constructor's default vocabulary is deliberate,
            # but it should be visible rather than silent.
            logging.getLogger(__name__).warning(
                "Could not load vocab.json for %r (%s); "
                "continuing without a vocab_file.",
                pretrained_model_name_or_path,
                err,
            )
        else:
            if vocab_file:
                kwargs["vocab_file"] = vocab_file

    return cls(*inputs, **kwargs)
|
| 53 |
+
|
| 54 |
def __init__(
|
| 55 |
self,
|
| 56 |
vocab_file=None,
|
|
|
|
| 73 |
with open(vocab_file, "r") as f:
|
| 74 |
self.encoder = json.load(f)
|
| 75 |
self.decoder = {i: k for k, i in self.encoder.items()}
|
| 76 |
+
|
| 77 |
+
self.compiled_regex = re.compile(
|
| 78 |
+
"|".join(list(self.encoder.keys()) + [r"\S"])
|
| 79 |
+
)
|
| 80 |
else:
|
| 81 |
self.encoder = {k: i for i, k in enumerate(self.special_tokens + self.codons)}
|
| 82 |
self.decoder = {i: k for k, i in self.encoder.items()}
|
| 83 |
|
| 84 |
+
self.compiled_regex = re.compile(
|
| 85 |
+
"|".join(self.codons + self.special_tokens + [r"\S"])
|
| 86 |
+
)
|
| 87 |
|
| 88 |
super().__init__(
|
| 89 |
cls_token=cls_token,
|