jheuschkel committed on
Commit
3c4d9a5
·
verified ·
1 Parent(s): 6e85141

Delete tokenization_syncodonlm.py

Browse files
Files changed (1) hide show
  1. tokenization_syncodonlm.py +0 -39
tokenization_syncodonlm.py DELETED
@@ -1,39 +0,0 @@
1
- from transformers import PreTrainedTokenizer
2
- import json
3
- import os
4
-
5
class SynCodonLMTokenizer(PreTrainedTokenizer):
    """Codon-level tokenizer: splits a nucleotide sequence into 3-base codons.

    Input is upper-cased and ``U`` is mapped to ``T`` before splitting, so RNA
    and DNA sequences share a single codon vocabulary. The vocabulary is a
    JSON ``{token: id}`` mapping loaded from ``vocab_file``.
    """

    def __init__(self, vocab_file, unk_token="[UNK]", pad_token="[PAD]", cls_token="[CLS]",
                 sep_token="[SEP]", mask_token="[MASK]", **kwargs):
        # Load the vocabulary BEFORE calling super().__init__():
        # PreTrainedTokenizer.__init__ registers the special tokens and may
        # invoke the token<->id conversion machinery while doing so, which
        # would fail with an AttributeError if self.vocab were not set yet.
        self.vocab = self._load_vocab(vocab_file)
        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
        self.tokens_to_ids = self.vocab
        self.vocab_file = vocab_file
        super().__init__(unk_token=unk_token, pad_token=pad_token,
                         cls_token=cls_token, sep_token=sep_token,
                         mask_token=mask_token, **kwargs)

    @property
    def vocab_size(self):
        """Size of the base vocabulary; required by PreTrainedTokenizer
        (e.g. for ``len(tokenizer)``)."""
        return len(self.vocab)

    def _load_vocab(self, vocab_file):
        """Read a ``{token: id}`` mapping from a JSON file."""
        # Explicit encoding so the vocab loads identically on every platform.
        with open(vocab_file, "r", encoding="utf-8") as f:
            return json.load(f)

    def _tokenize(self, text):
        """Split ``text`` into non-overlapping 3-character codons.

        Normalizes to upper case and maps RNA ``U`` to DNA ``T`` first.
        A trailing partial codon (len(text) not divisible by 3) is kept
        as-is and will resolve to the UNK id.
        """
        text = text.upper().replace("U", "T")
        return [text[i:i + 3] for i in range(0, len(text), 3)]

    def _convert_token_to_id(self, token):
        """Single-token lookup hook used by the base class; unknown tokens
        fall back to the UNK id."""
        return self.tokens_to_ids.get(token, self.tokens_to_ids.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Single-id lookup hook used by the base class; unknown ids fall
        back to the UNK token."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_ids(self, tokens):
        """Map a token or list of tokens to id(s).

        Accepts a single ``str`` as well as a list (base-class contract);
        the original implementation would have iterated a bare string
        character by character.
        """
        if isinstance(tokens, str):
            return self._convert_token_to_id(tokens)
        return [self._convert_token_to_id(token) for token in tokens]

    def convert_ids_to_tokens(self, ids):
        """Map an id or list of ids back to token(s); unknown ids become
        the UNK token."""
        if isinstance(ids, int):
            return self._convert_id_to_token(ids)
        return [self._convert_id_to_token(i) for i in ids]

    def get_vocab(self):
        """Return the ``{token: id}`` vocabulary as a fresh dict so callers
        cannot mutate the tokenizer's internal state."""
        return dict(self.vocab)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the vocabulary JSON into ``save_directory``.

        Returns a 1-tuple with the written path, per the
        PreTrainedTokenizer.save_vocabulary contract.
        """
        # Create the target directory if needed; the original raised
        # FileNotFoundError when saving into a not-yet-existing directory.
        os.makedirs(save_directory, exist_ok=True)
        vocab_path = os.path.join(save_directory, (filename_prefix or "") + "vocab.json")
        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False)
        return (vocab_path,)