Delete tokenization_syncodonlm.py
tokenization_syncodonlm.py  +0 -39  DELETED
@@ -1,39 +0,0 @@
-from transformers import PreTrainedTokenizer
-import json
-import os
-
-class SynCodonLMTokenizer(PreTrainedTokenizer):
-    def __init__(self, vocab_file, unk_token="[UNK]", pad_token="[PAD]", cls_token="[CLS]",
-                 sep_token="[SEP]", mask_token="[MASK]", **kwargs):
-        super().__init__(unk_token=unk_token, pad_token=pad_token,
-                         cls_token=cls_token, sep_token=sep_token,
-                         mask_token=mask_token, **kwargs)
-        self.vocab = self._load_vocab(vocab_file)
-        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
-        self.tokens_to_ids = self.vocab
-        self.vocab_file = vocab_file
-
-    def _load_vocab(self, vocab_file):
-        with open(vocab_file, "r") as f:
-            return json.load(f)
-
-    def _tokenize(self, text):
-        # Codon splitting logic
-        text = text.upper().replace("U", "T")
-        codons = [text[i:i+3] for i in range(0, len(text), 3)]
-        return codons
-
-    def convert_tokens_to_ids(self, tokens):
-        return [self.tokens_to_ids.get(token, self.tokens_to_ids.get(self.unk_token)) for token in tokens]
-
-    def convert_ids_to_tokens(self, ids):
-        return [self.ids_to_tokens.get(i, self.unk_token) for i in ids]
-
-    def get_vocab(self):
-        return self.vocab
-
-    def save_vocabulary(self, save_directory, filename_prefix=None):
-        vocab_path = os.path.join(save_directory, (filename_prefix or "") + "vocab.json")
-        with open(vocab_path, "w") as f:
-            json.dump(self.vocab, f)
-        return (vocab_path,)
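For reference, the deleted class follows an older transformers subclassing pattern that breaks on recent releases: it calls super().__init__() before self.vocab exists, even though the base __init__ now resolves special tokens through get_vocab(), and it overrides the public convert_tokens_to_ids / convert_ids_to_tokens instead of the private _convert_token_to_id / _convert_id_to_token hooks, which bypasses added-token handling. Below is a minimal corrected sketch, assuming transformers >= 4.34; CodonTokenizerSketch is a hypothetical name for illustration, not code from this repository.

from transformers import PreTrainedTokenizer
import json
import os

class CodonTokenizerSketch(PreTrainedTokenizer):  # hypothetical name, for illustration
    def __init__(self, vocab_file, unk_token="[UNK]", pad_token="[PAD]", cls_token="[CLS]",
                 sep_token="[SEP]", mask_token="[MASK]", **kwargs):
        # Load the vocab *before* super().__init__(): the base class registers
        # special tokens during init and looks them up through get_vocab().
        with open(vocab_file, "r") as f:
            self.vocab = json.load(f)
        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
        super().__init__(unk_token=unk_token, pad_token=pad_token,
                         cls_token=cls_token, sep_token=sep_token,
                         mask_token=mask_token, **kwargs)

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab)

    def _tokenize(self, text):
        # Normalize to DNA (uppercase, RNA U -> T) and split into 3-mers (codons).
        text = text.upper().replace("U", "T")
        return [text[i:i + 3] for i in range(0, len(text), 3)]

    # Private per-token hooks: the base class wraps these with added-token
    # handling, so the public convert_* methods should not be overridden.
    def _convert_token_to_id(self, token):
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        return self.ids_to_tokens.get(index, self.unk_token)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        prefix = filename_prefix + "-" if filename_prefix else ""
        vocab_path = os.path.join(save_directory, prefix + "vocab.json")
        with open(vocab_path, "w") as f:
            json.dump(self.vocab, f)
        return (vocab_path,)

With a toy vocab containing the five special tokens plus codon entries, tokenizer._tokenize("augGCUtaa") yields ['ATG', 'GCT', 'TAA'] (uppercased, RNA U mapped to DNA T), and codons missing from the vocab resolve to the [UNK] id.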