File size: 12,659 Bytes
0ba1d66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89ce8d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0ba1d66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89ce8d5
 
 
 
0ba1d66
 
 
 
89ce8d5
 
 
0ba1d66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
import json
import os
import re
from transformers import PreTrainedTokenizer
from itertools import product

class DeCodonTokenizer(PreTrainedTokenizer):
    """
    Codon-level tokenizer: splits coding sequences into 3-mer codon tokens.

    The input sequences are expected to be raw coding DNA/RNA sequences.
    Input text is upper-cased before matching, and any non-whitespace
    character that is not part of a known token becomes a single-character
    token of its own.
    """

    # Sequence alphabets accepted by `get_all_codons` (it lower-cases its argument first).
    SUPPORTED_TYPES = ["dna", "rna"]
    
    @staticmethod
    def get_all_codons(seq_type="dna"):
        """
        Get all possible codons.
        """
        seq_type = seq_type.lower()
        assert (
            seq_type in DeCodonTokenizer.SUPPORTED_TYPES
        ), f"seq_type should be either 'dna' or 'rna'. Got {seq_type}!"

        if seq_type == "dna":
            return ["".join(codon) for codon in product("ACGT", repeat=3)]
        else:
            return ["".join(codon) for codon in product("ACGU", repeat=3)]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        """
        Instantiate a DeCodonTokenizer from a local directory or a hub identifier.

        If a `vocab.json` can be located it is passed to the constructor as
        `vocab_file`; otherwise the constructor falls back to its default
        vocabulary.
        """
        if os.path.isdir(pretrained_model_name_or_path):
            # Local checkpoint: use the vocabulary stored next to it, if any.
            local_vocab = os.path.join(pretrained_model_name_or_path, "vocab.json")
            if os.path.exists(local_vocab):
                kwargs["vocab_file"] = local_vocab
            return cls(*inputs, **kwargs)

        # Not a directory: treat the argument as a hub id and resolve through the cache.
        from transformers.utils import cached_file

        try:
            resolved_vocab = cached_file(pretrained_model_name_or_path, "vocab.json")
        except Exception:
            # Best effort: a missing vocab.json means "use the default vocab".
            resolved_vocab = None

        if resolved_vocab:
            kwargs["vocab_file"] = resolved_vocab

        return cls(*inputs, **kwargs)

    def __init__(
        self,
        vocab_file=None,
        cls_token="<CLS>",
        bos_token="<CLS>",
        sep_token="<SEP>",
        unk_token="<UNK>",
        pad_token="<PAD>",
        mask_token="<MASK>",
        seq_type="dna",
        **kwargs,
    ):
        """
        Build the codon vocabulary, the tokenizing regexes and the amino-acid tables.

        Args:
            vocab_file (`str`, *optional*): Path to a JSON file mapping token -> id.
                When omitted, the vocabulary is the special tokens followed by
                all 64 codons.
            cls_token, bos_token, sep_token, unk_token, pad_token, mask_token (`str`):
                Special tokens, forwarded to `PreTrainedTokenizer.__init__`.
            seq_type (`str`): "dna" or "rna" (case-insensitive).
            **kwargs: Forwarded to `PreTrainedTokenizer.__init__`. The key
                `token_type_mode` ("regular", "regular_special" or "aa";
                defaults to "regular") is also read from here.
        """
        # Normalise once: `get_all_codons` is case-insensitive, so the RNA
        # check further down must see the same lower-cased value, otherwise
        # "RNA" would yield U-codons paired with T-keyed amino-acid tables.
        seq_type = seq_type.lower()
        self.codons = self.get_all_codons(seq_type=seq_type)
        self.seq_type = seq_type
        self.special_tokens = [
            str(token)
            for token in (cls_token, sep_token, unk_token, pad_token, mask_token)
        ]

        if vocab_file is not None:
            with open(vocab_file, "r", encoding="utf-8") as f:
                self.encoder = json.load(f)
        else:
            self.encoder = {
                k: i for i, k in enumerate(self.special_tokens + self.codons)
            }
        self.decoder = {i: k for k, i in self.encoder.items()}

        # Tokens are literals, not patterns: escape them so entries such as
        # "[MASK]" or "*" are matched verbatim. `\S` is the last alternative
        # and catches any other non-whitespace character one at a time.
        self.compiled_regex = re.compile(
            "|".join([re.escape(token) for token in self.encoder] + [r"\S"])
        )

        # The vocabulary must exist before the base initialiser runs.
        super().__init__(
            cls_token=cls_token,
            bos_token=bos_token,
            sep_token=sep_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )

        # Genetic code table, spelled with DNA codons ("*" marks stop codons).
        self.aa_to_codon = {
            "A": ["GCT", "GCC", "GCA", "GCG"],
            "C": ["TGT", "TGC"],
            "D": ["GAT", "GAC"],
            "E": ["GAA", "GAG"],
            "F": ["TTT", "TTC"],
            "G": ["GGT", "GGC", "GGA", "GGG"],
            "H": ["CAT", "CAC"],
            "I": ["ATT", "ATC", "ATA"],
            "K": ["AAA", "AAG"],
            "L": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"],
            "M": ["ATG"],
            "N": ["AAT", "AAC"],
            "P": ["CCT", "CCC", "CCA", "CCG"],
            "Q": ["CAA", "CAG"],
            "R": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"],
            "S": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"],
            "T": ["ACT", "ACC", "ACA", "ACG"],
            "V": ["GTT", "GTC", "GTA", "GTG"],
            "W": ["TGG"],
            "Y": ["TAT", "TAC"],
            "*": ["TAA", "TAG", "TGA"],
        }
        self.codon_to_aa = {
            codon: aa for aa, codons in self.aa_to_codon.items() for codon in codons
        }

        if seq_type == "rna":
            # Re-spell both tables with U so they line up with `self.codons`.
            self.aa_to_codon = {
                k: [c.replace("T", "U") for c in v] for k, v in self.aa_to_codon.items()
            }
            self.codon_to_aa = {
                k.replace("T", "U"): v for k, v in self.codon_to_aa.items()
            }

        self.amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
        self.encoder_aa = {
            k: i for i, k in enumerate(self.special_tokens + self.amino_acids)
        }
        self.compiled_regex_aa = re.compile(
            "|".join(
                [re.escape(token) for token in self.amino_acids + self.special_tokens]
                + [r"\S"]
            )
        )

        self.token_type_mode = kwargs.get("token_type_mode", "regular")
        self.build_token_type_encoder()
        
    def set_organism_tokens(self, organism_tokens):
        """
        Add organism tokens to the tokenizer.
        """
        vocab_size = len(self.encoder)
        for i, token in enumerate(organism_tokens):
            self.encoder[token] = vocab_size + i
            self.decoder[vocab_size + i] = token

        self.organism_tokens = organism_tokens
        self.compiled_regex = re.compile(
            "|".join(self.codons + self.special_tokens + organism_tokens + [r"\S"])
        )

    @property
    def vocab_size(self):
        return len(self.encoder)

    def build_token_type_encoder(self):
        if self.token_type_mode == "aa":
            # build a token type encoder for amino acids with codon ids as keys and amino acid ids as values
            # CLS, SEP, UNK, MASK, PAD tokens are assigned to the same token type as zero
            token_type_encoder = {}
            for token, token_id in self.encoder.items():
                if token in self.special_tokens:
                    token_type_encoder[token_id] = 0
                elif token in self.codons:
                    aa = self.codon_to_aa[token]
                    token_type_encoder[token_id] = (
                        list(self.amino_acids + ["*"]).index(aa) + 1
                    )
                else:
                    token_type_encoder[token_id] = len(self.amino_acids) + 2
        elif self.token_type_mode == "regular":
            # build a token type encoder for regular tokens
            token_type_encoder = {token_id: 0 for token_id in self.encoder.values()}
        elif self.token_type_mode == "regular_special":
            # build a token type encoder for regular tokens with special tokens having a different but same token type
            token_type_encoder = {
                token_id: 0 if token in self.special_tokens else 1
                for token, token_id in self.encoder.items()
            }
        else:
            raise ValueError(f"Unknown token type mode: {self.token_type_mode}")

        self.token_type_encoder = token_type_encoder

    @property
    def token_type_vocab_size(self):
        return len(set(self.token_type_encoder.values())) + 1

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def _tokenize(self, text):
        """
        Tokenize a string.
        """
        text = text.upper()
        tokens = self.compiled_regex.findall(text)
        return tokens

    def _convert_token_to_id(self, token):
        """
        Converts a token (str) in an id using the vocab.
        """
        return self.encoder.get(token, self.encoder[self.unk_token])

    def _convert_id_to_token(self, index):
        """
        Converts an index (integer) in a token (str) using the vocab.
        """
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """
        Converts a sequence of tokens (string) in a single string.
        """
        return "".join(tokens)

    def encode_aa(self, text):
        """
        Encode a DNA/RNA string using the amino acid vocab.
        """
        tokens = self._tokenize(text)
        return [
            self.encoder_aa.get(token, self.encoder_aa[self.unk_token])
            for token in tokens
        ]

    def get_aa_vocab_size(self):
        return len(self.encoder_aa)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens.

        This implementation does not add special tokens and this method should be overridden in a subclass.

        Args:
            token_ids_0 (`List[int]`): The first tokenized sequence.
            token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.

        Returns:
            `List[int]`: The model input with special tokens.
        """
        token_ids_0 = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        return token_ids_0

    def get_special_tokens_mask(
        self, token_ids_0, token_ids_1=None, already_has_special_tokens: bool = False
    ):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        special_ids = [
            self.eos_token_id,
            self.pad_token_id,
            self.mask_token_id,
            self.sep_token_id,
            self.cls_token_id,
        ]

        if already_has_special_tokens:
            special_tokens_mask = [
                1 if idx in special_ids else 0 for idx in token_ids_0
            ]
        else:
            special_tokens_mask = (
                [1] + [1 if idx in special_ids else 0 for idx in token_ids_0] + [1]
            )

        return special_tokens_mask

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Create the token type IDs corresponding to the sequences passed. [What are token type
        IDs?](../glossary#token-type-ids)

        Should be overridden in a subclass if the model has a special way of building those.

        Args:
            token_ids_0 (`List[int]`): The first tokenized sequence.
            token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.

        Returns:
            `List[int]`: The token type ids.
        """
        # special_ids = [
        #     self.bos_token_id,
        #     self.eos_token_id,
        #     self.pad_token_id,
        #     self.mask_token_id,
        #     self.cls_token_id,
        #     self.sep_token_id,
        # ]

        # token_type_ids = [0] + [0 for idx in token_ids_0] + [0]

        unk_type_id = len(set(self.token_type_encoder.values()))

        token_type_ids = [
            self.token_type_encoder.get(token_id, unk_type_id)
            for token_id in token_ids_0
        ]

        return token_type_ids

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save only the vocabulary of the tokenizer (vocabulary + added tokens).

        This method won't save the configuration and special token mappings of the tokenizer. Use
        [`~PreTrainedTokenizerFast._save_pretrained`] to save the whole state of the tokenizer.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the named of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if filename_prefix is None:
            filename_prefix = ""
            
        vocab_file = os.path.join(save_directory, filename_prefix + "vocab.json")

        with open(vocab_file, "w") as f:
            json.dump(self.encoder, f)

        return (vocab_file,)