mohsennp committed on
Commit
0873c46
·
verified ·
1 Parent(s): 484fbd6

Upload tokenizer

Browse files
special_tokens_map.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<CLS>",
3
+ "cls_token": "<CLS>",
4
+ "mask_token": "<MASK>",
5
+ "pad_token": "<PAD>",
6
+ "sep_token": "<SEP>",
7
+ "unk_token": "<UNK>"
8
+ }
tokenization_encodon.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+
5
+ from itertools import product
6
+ from transformers import PreTrainedTokenizer
7
+
8
+
9
class EnCodonTokenizer(PreTrainedTokenizer):
    """
    EnCodon Tokenizer: tokenize 3-mer codons into tokens.

    The input sequences are expected to be raw sequences of coding DNA/RNA
    sequences. The vocabulary is laid out as [special tokens] + [64 codons],
    which matches the order of the accompanying ``vocab.json``.
    """

    SUPPORTED_TYPES = ["dna", "rna"]

    @staticmethod
    def get_all_codons(seq_type="dna"):
        """
        Return all 64 possible codons for the given sequence type.

        Args:
            seq_type (`str`): Either ``"dna"`` (ACGT alphabet) or ``"rna"``
                (ACGU alphabet). Case-insensitive.

        Returns:
            `List[str]`: All 3-letter codons in lexicographic order.

        Raises:
            ValueError: If ``seq_type`` is not ``"dna"`` or ``"rna"``.
        """
        seq_type = seq_type.lower()
        if seq_type not in EnCodonTokenizer.SUPPORTED_TYPES:
            # Raise instead of assert: asserts are stripped under `python -O`.
            raise ValueError(
                f"seq_type should be either 'dna' or 'rna'. Got {seq_type}!"
            )

        alphabet = "ACGT" if seq_type == "dna" else "ACGU"
        return ["".join(codon) for codon in product(alphabet, repeat=3)]

    def __init__(
        self,
        cls_token="<CLS>",
        bos_token="<CLS>",
        sep_token="<SEP>",
        unk_token="<UNK>",
        pad_token="<PAD>",
        mask_token="<MASK>",
        seq_type="dna",
        **kwargs,
    ):
        """
        Args:
            cls_token (`str`): Classification token, prepended to sequences.
            bos_token (`str`): Beginning-of-sequence token; aliases `cls_token`
                by default and is intentionally not part of `special_tokens`.
            sep_token (`str`): Separator token, appended to sequences.
            unk_token (`str`): Token for characters outside the codon vocab.
            pad_token (`str`): Padding token.
            mask_token (`str`): Mask token for masked language modeling.
            seq_type (`str`): `"dna"` or `"rna"`; selects the codon alphabet.
            kwargs: Forwarded to `PreTrainedTokenizer.__init__`. May contain
                `token_type_mode` ("regular", "regular_special" or "aa").
        """
        self.codons = self.get_all_codons(seq_type=seq_type)
        self.seq_type = seq_type
        self.special_tokens = [cls_token, sep_token, unk_token, pad_token, mask_token]

        # The vocab must exist *before* super().__init__, which may call
        # get_vocab()/_tokenize() while registering special tokens.
        self.encoder = {k: i for i, k in enumerate(self.special_tokens + self.codons)}
        self.decoder = {i: k for k, i in self.encoder.items()}
        # re.escape guards against regex metacharacters in user-supplied
        # special tokens (e.g. "[CLS]"); the trailing \S alternative catches
        # any stray single character so it can be mapped to the unk token.
        # Codons come first so 3-mers are preferred over the 1-char fallback.
        self.compiled_regex = re.compile(
            "|".join(
                [re.escape(t) for t in self.codons + self.special_tokens] + [r"\S"]
            )
        )

        super().__init__(
            cls_token=cls_token,
            bos_token=bos_token,
            sep_token=sep_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )

        # Standard genetic code, keyed by one-letter amino acid ("*" = stop).
        self.aa_to_codon = {
            "A": ["GCT", "GCC", "GCA", "GCG"],
            "C": ["TGT", "TGC"],
            "D": ["GAT", "GAC"],
            "E": ["GAA", "GAG"],
            "F": ["TTT", "TTC"],
            "G": ["GGT", "GGC", "GGA", "GGG"],
            "H": ["CAT", "CAC"],
            "I": ["ATT", "ATC", "ATA"],
            "K": ["AAA", "AAG"],
            "L": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"],
            "M": ["ATG"],
            "N": ["AAT", "AAC"],
            "P": ["CCT", "CCC", "CCA", "CCG"],
            "Q": ["CAA", "CAG"],
            "R": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"],
            "S": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"],
            "T": ["ACT", "ACC", "ACA", "ACG"],
            "V": ["GTT", "GTC", "GTA", "GTG"],
            "W": ["TGG"],
            "Y": ["TAT", "TAC"],
            "*": ["TAA", "TAG", "TGA"],
        }
        self.codon_to_aa = {
            codon: aa for aa, codons in self.aa_to_codon.items() for codon in codons
        }

        if seq_type == "rna":
            # Rewrite the DNA codon table to the RNA alphabet (T -> U).
            self.aa_to_codon = {
                k: [c.replace("T", "U") for c in v] for k, v in self.aa_to_codon.items()
            }
            self.codon_to_aa = {
                k.replace("T", "U"): v for k, v in self.codon_to_aa.items()
            }

        # Secondary amino-acid-level vocabulary, used by encode_aa().
        self.amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
        self.encoder_aa = {
            k: i for i, k in enumerate(self.special_tokens + self.amino_acids)
        }
        # Same escaping rationale as compiled_regex above.
        self.compiled_regex_aa = re.compile(
            "|".join(
                [re.escape(t) for t in self.amino_acids + self.special_tokens]
                + [r"\S"]
            )
        )

        self.token_type_mode = kwargs.get("token_type_mode", "regular")
        self.build_token_type_encoder()

    @property
    def vocab_size(self):
        """Size of the base (codon + special token) vocabulary."""
        return len(self.encoder)

    def build_token_type_encoder(self):
        """
        Build `self.token_type_encoder`, mapping token id -> token type id,
        according to `self.token_type_mode`.

        Modes:
            - "aa": codons are typed by the amino acid they encode (1..21,
              with "*" for stop), special tokens get 0, anything else gets
              len(amino_acids) + 2.
            - "regular": every token gets type 0.
            - "regular_special": special tokens get 0, everything else 1.

        Raises:
            ValueError: On an unknown `token_type_mode`.
        """
        if self.token_type_mode == "aa":
            # Token types follow the amino acid encoded by each codon;
            # CLS, SEP, UNK, MASK, PAD all share type zero.
            token_type_encoder = {}
            for token, token_id in self.encoder.items():
                if token in self.special_tokens:
                    token_type_encoder[token_id] = 0
                elif token in self.codons:
                    aa = self.codon_to_aa[token]
                    token_type_encoder[token_id] = (
                        list(self.amino_acids + ["*"]).index(aa) + 1
                    )
                else:
                    token_type_encoder[token_id] = len(self.amino_acids) + 2
        elif self.token_type_mode == "regular":
            # Single shared token type for the whole vocabulary.
            token_type_encoder = {token_id: 0 for token_id in self.encoder.values()}
        elif self.token_type_mode == "regular_special":
            # Two types: 0 for special tokens, 1 for everything else.
            token_type_encoder = {
                token_id: 0 if token in self.special_tokens else 1
                for token, token_id in self.encoder.items()
            }
        else:
            raise ValueError(f"Unknown token type mode: {self.token_type_mode}")

        self.token_type_encoder = token_type_encoder

    @property
    def token_type_vocab_size(self):
        """Number of distinct token types, plus one reserved for unknown ids."""
        return len(set(self.token_type_encoder.values())) + 1

    def get_vocab(self):
        """Return the full vocabulary (base vocab plus added tokens)."""
        return dict(self.encoder, **self.added_tokens_encoder)

    def _tokenize(self, text):
        """
        Tokenize a string into codons / special tokens.

        The text is uppercased first; any character not part of a codon or
        special token is emitted as a single-char token (later mapped to UNK).
        """
        text = text.upper()
        tokens = self.compiled_regex.findall(text)
        return tokens

    def _convert_token_to_id(self, token):
        """
        Converts a token (str) in an id using the vocab.
        """
        return self.encoder.get(token, self.encoder[self.unk_token])

    def _convert_id_to_token(self, index):
        """
        Converts an index (integer) in a token (str) using the vocab.
        """
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """
        Converts a sequence of tokens (string) in a single string.
        """
        return "".join(tokens)

    def encode_aa(self, text):
        """
        Encode a DNA/RNA string using the amino acid vocab.

        Note: tokenization still uses the codon regex; tokens that are not
        amino-acid letters or special tokens map to the UNK id of encoder_aa.
        """
        tokens = self._tokenize(text)
        return [
            self.encoder_aa.get(token, self.encoder_aa[self.unk_token])
            for token in tokens
        ]

    def get_aa_vocab_size(self):
        """Size of the amino-acid-level vocabulary."""
        return len(self.encoder_aa)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence by adding special tokens:
        ``<CLS> token_ids_0 <SEP>``.

        Args:
            token_ids_0 (`List[int]`): The first tokenized sequence.
            token_ids_1 (`List[int]`, *optional*): The second tokenized
                sequence; currently ignored.

        Returns:
            `List[int]`: The model input with special tokens.
        """
        token_ids_0 = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        return token_ids_0

    def get_special_tokens_mask(
        self, token_ids_0, token_ids_1=None, already_has_special_tokens: bool = False
    ):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of ids of the second sequence; currently ignored.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        special_ids = [
            self.pad_token_id,
            self.mask_token_id,
            self.sep_token_id,
            self.cls_token_id,
        ]

        if already_has_special_tokens:
            special_tokens_mask = [
                1 if idx in special_ids else 0 for idx in token_ids_0
            ]
        else:
            # Account for the CLS/SEP this tokenizer adds around the sequence.
            special_tokens_mask = (
                [1] + [1 if idx in special_ids else 0 for idx in token_ids_0] + [1]
            )

        return special_tokens_mask

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Create the token type IDs corresponding to the sequences passed. [What are token type
        IDs?](../glossary#token-type-ids)

        Ids absent from `token_type_encoder` (e.g. tokens added after init)
        are assigned a dedicated unknown type id.

        Args:
            token_ids_0 (`List[int]`): The first tokenized sequence.
            token_ids_1 (`List[int]`, *optional*): The second tokenized
                sequence; currently ignored.

        Returns:
            `List[int]`: The token type ids.
        """
        unk_type_id = len(set(self.token_type_encoder.values()))

        token_type_ids = [
            self.token_type_encoder.get(token_id, unk_type_id)
            for token_id in token_ids_0
        ]

        return token_type_ids

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save only the vocabulary of the tokenizer (vocabulary + added tokens).

        This method won't save the configuration and special token mappings of the tokenizer. Use
        [`~PreTrainedTokenizerFast._save_pretrained`] to save the whole state of the tokenizer.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the named of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if filename_prefix is None:
            filename_prefix = ""

        vocab_file = os.path.join(save_directory, filename_prefix + "vocab.json")

        with open(vocab_file, "w") as f:
            json.dump(self.encoder, f)

        return (vocab_file,)
tokenizer_config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<CLS>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<SEP>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<UNK>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<PAD>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<MASK>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "auto_map": {
45
+ "AutoTokenizer": [
46
+ "tokenization_encodon.EnCodonTokenizer",
47
+ null
48
+ ]
49
+ },
50
+ "bos_token": "<CLS>",
51
+ "clean_up_tokenization_spaces": true,
52
+ "cls_token": "<CLS>",
53
+ "mask_token": "<MASK>",
54
+ "model_max_length": 2048,
55
+ "pad_token": "<PAD>",
56
+ "sep_token": "<SEP>",
57
+ "tokenizer_class": "EnCodonTokenizer",
58
+ "unk_token": "<UNK>"
59
+ }
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<CLS>": 0, "<SEP>": 1, "<UNK>": 2, "<PAD>": 3, "<MASK>": 4, "AAA": 5, "AAC": 6, "AAG": 7, "AAT": 8, "ACA": 9, "ACC": 10, "ACG": 11, "ACT": 12, "AGA": 13, "AGC": 14, "AGG": 15, "AGT": 16, "ATA": 17, "ATC": 18, "ATG": 19, "ATT": 20, "CAA": 21, "CAC": 22, "CAG": 23, "CAT": 24, "CCA": 25, "CCC": 26, "CCG": 27, "CCT": 28, "CGA": 29, "CGC": 30, "CGG": 31, "CGT": 32, "CTA": 33, "CTC": 34, "CTG": 35, "CTT": 36, "GAA": 37, "GAC": 38, "GAG": 39, "GAT": 40, "GCA": 41, "GCC": 42, "GCG": 43, "GCT": 44, "GGA": 45, "GGC": 46, "GGG": 47, "GGT": 48, "GTA": 49, "GTC": 50, "GTG": 51, "GTT": 52, "TAA": 53, "TAC": 54, "TAG": 55, "TAT": 56, "TCA": 57, "TCC": 58, "TCG": 59, "TCT": 60, "TGA": 61, "TGC": 62, "TGG": 63, "TGT": 64, "TTA": 65, "TTC": 66, "TTG": 67, "TTT": 68}