FreakingPotato committed
Commit 10234c4 · 1 Parent(s): f39c34e

Upload RNAElectra pretrained model weights and tokenizer

Files changed (7)
  1. README.md +32 -0
  2. config.json +46 -0
  3. pytorch_model.bin +3 -0
  4. special_tokens_map.json +9 -0
  5. tokenizer.py +236 -0
  6. tokenizer_config.json +71 -0
  7. vocab.json +29 -0
README.md CHANGED
@@ -1,3 +1,35 @@
  ---
  license: apache-2.0
  ---
+ # RNAElectra
+
+ RNAElectra is a pretrained RNA language model for nucleotide-level sequence representation learning.
+
+ ## Load model
+
+ ```python
+ import torch
+ from transformers import AutoModel
+ from tokenizer import NucEL_Tokenizer
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ model = AutoModel.from_pretrained(
+     "FreakingPotato/RNAElectra",
+     trust_remote_code=True
+ ).to(device)
+
+ tokenizer = NucEL_Tokenizer.from_pretrained(
+     "FreakingPotato/RNAElectra",
+     trust_remote_code=True
+ )
+
+ sequence = "AUGCAUGCAUGCAUGC"
+ # The tokenizer vocabulary uses the DNA alphabet (A/C/G/T), so map U -> T
+ # before encoding RNA input; otherwise every U becomes [UNK].
+ sequence = sequence.replace("U", "T")
+ inputs = tokenizer(sequence, return_tensors="pt")
+ inputs = {k: v.to(device) for k, v in inputs.items()}
+
+ with torch.no_grad():
+     outputs = model(**inputs)
+
+ embeddings = outputs.last_hidden_state
+ print(embeddings.shape)
+ ```
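+
+ For a fixed-length per-sequence embedding, one option (an illustrative sketch, not part of the released code) is to mean-pool the token embeddings over the attention mask:
+
+ ```python
+ # Mean-pool token embeddings, ignoring padding positions.
+ mask = inputs["attention_mask"].unsqueeze(-1).float()
+ seq_embedding = (embeddings * mask).sum(dim=1) / mask.sum(dim=1)
+ print(seq_embedding.shape)  # (batch_size, hidden_size)
+ ```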
config.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "architectures": [
+     "ModernBertModel"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 50281,
+   "classifier_activation": "gelu",
+   "classifier_bias": false,
+   "classifier_dropout": 0.0,
+   "classifier_pooling": "cls",
+   "cls_token_id": 2,
+   "decoder_bias": true,
+   "deterministic_flash_attn": false,
+   "dtype": "float32",
+   "embedding_dropout": 0.0,
+   "eos_token_id": 50282,
+   "global_attn_every_n_layers": 3,
+   "global_rope_theta": 10000,
+   "hidden_activation": "gelu",
+   "hidden_size": 512,
+   "initializer_cutoff_factor": 2.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 2048,
+   "layer_norm_eps": 1e-12,
+   "local_attention": 128,
+   "local_rope_theta": 1000,
+   "mask_token_id": 3,
+   "max_position_embeddings": 8192,
+   "mlp_bias": false,
+   "mlp_dropout": 0.0,
+   "model_type": "modernbert",
+   "norm_bias": false,
+   "norm_eps": 1e-12,
+   "num_attention_heads": 16,
+   "num_hidden_layers": 22,
+   "pad_token_id": 1,
+   "repad_logits_with_grad": false,
+   "sep_token_id": 50282,
+   "sparse_pred_ignore_index": -100,
+   "sparse_prediction": false,
+   "tie_word_embeddings": false,
+   "transformers_version": "4.57.3",
+   "unknown_token_id": 0,
+   "vocab_size": 27
+ }
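
The config describes a ModernBERT encoder backbone (22 layers, hidden size 512, 16 attention heads, global attention every 3 layers). A quick way to sanity-check these values after download is the standard `AutoConfig` API (a sketch, not part of this commit):

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("FreakingPotato/RNAElectra")
print(cfg.model_type)  # "modernbert"
print(cfg.hidden_size, cfg.num_hidden_layers, cfg.num_attention_heads)
```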
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0c477cad751b23b02b49fbc1dd7e4339fc74191ca082bd5f05eb20d71bf385dc
+ size 369289915
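
This is a Git LFS pointer: cloning the repo without LFS installed yields only these three lines rather than the ≈369 MB weights. The resolved binary can be fetched directly with the standard `huggingface_hub` API (illustrative):

```python
from huggingface_hub import hf_hub_download

# Downloads the resolved LFS object, not the 3-line pointer file.
weights_path = hf_hub_download(
    repo_id="FreakingPotato/RNAElectra",
    filename="pytorch_model.bin",
)
print(weights_path)
```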
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "bos_token": "[BOS]",
+   "cls_token": "[CLS]",
+   "eos_token": "[EOS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer.py ADDED
@@ -0,0 +1,236 @@
+ from typing import List, Dict, Optional, Union, Any, Tuple
+ import os
+ from transformers import PreTrainedTokenizer
+ from itertools import product
+ import json
+
+
+ class NucEL_Tokenizer(PreTrainedTokenizer):
+     """
+     K-mer tokenizer for nucleotide sequences, inheriting from Hugging Face's
+     PreTrainedTokenizer. Handles k-mer tokenization with support for special
+     tokens, padding, and truncation.
+     """
+
+     model_input_names = ["input_ids", "attention_mask"]
+
+     def __init__(
+         self,
+         k: int = 6,
+         model_max_length: int = 2048,
+         pad_token: str = "[PAD]",
+         unk_token: str = "[UNK]",
+         sep_token: str = "[SEP]",
+         cls_token: str = "[CLS]",
+         mask_token: str = "[MASK]",
+         bos_token: str = "[BOS]",
+         eos_token: str = "[EOS]",
+         num_reserved_tokens: int = 16,
+         **kwargs
+     ):
+         """Initialize the k-mer tokenizer."""
+         self.k = k
+         self.nucleotides = ['A', 'C', 'G', 'T']
+         self.num_reserved_tokens = num_reserved_tokens
+
+         # Define special tokens
+         self.special_tokens = {
+             "pad_token": pad_token,
+             "unk_token": unk_token,
+             "sep_token": sep_token,
+             "cls_token": cls_token,
+             "mask_token": mask_token,
+             "bos_token": bos_token,
+             "eos_token": eos_token,
+         }
+
+         # Build vocabulary (includes special tokens, nucleotides, and k-mers);
+         # must happen before super().__init__, which queries get_vocab().
+         self._init_vocabulary()
+
+         # Now initialize the parent class.
+         super().__init__(
+             model_max_length=model_max_length,
+             pad_token=pad_token,
+             unk_token=unk_token,
+             sep_token=sep_token,
+             cls_token=cls_token,
+             mask_token=mask_token,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             **kwargs
+         )
+
+     def _init_vocabulary(self):
+         """Initialize the vocabulary with special tokens, nucleotides, and k-mers."""
+         # Special tokens in a fixed order
+         special_tokens = [
+             self.special_tokens["pad_token"],
+             self.special_tokens["unk_token"],
+             self.special_tokens["cls_token"],
+             self.special_tokens["sep_token"],
+             self.special_tokens["mask_token"],
+             self.special_tokens["bos_token"],
+             self.special_tokens["eos_token"]
+         ]
+
+         # Individual nucleotides
+         nucleotides = self.nucleotides
+
+         # Generate all possible k-mers
+         kmers = [''.join(p) for p in product(self.nucleotides, repeat=self.k)]
+
+         # Reserved tokens for future use
+         reserved_tokens = [f"[RESERVED_{i}]" for i in range(self.num_reserved_tokens)]
+
+         # Combine all tokens in a fixed order
+         all_tokens = special_tokens + nucleotides + kmers + reserved_tokens
+
+         # Create vocabulary: token -> index
+         self.vocab = {token: idx for idx, token in enumerate(all_tokens)}
+
+         # Create reverse mapping: index -> token
+         self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()}
+
+     @property
+     def vocab_size(self) -> int:
+         """Return the size of the vocabulary."""
+         return len(self.vocab)
+
+     def get_vocab(self) -> Dict[str, int]:
+         """Return the vocabulary dictionary."""
+         return self.vocab.copy()
+
+     def _tokenize(self, text: str) -> List[str]:
+         """
+         Tokenize a nucleotide sequence into k-mers and individual nucleotides.
+
+         Args:
+             text: Nucleotide sequence to tokenize.
+
+         Returns:
+             List of tokens, starting with the CLS token.
+         """
+         text = text.upper().strip()
+         tokens = [self.cls_token]
+         i = 0
+
+         while i < len(text):
+             # Try to consume a k-mer first
+             if i <= len(text) - self.k:
+                 kmer = text[i:i + self.k]
+                 if kmer in self.vocab:
+                     tokens.append(kmer)
+                     i += self.k
+                     continue
+
+             # Fallback: consume a single nucleotide
+             nucleotide = text[i]
+             if nucleotide in self.nucleotides:
+                 tokens.append(nucleotide)
+             else:
+                 tokens.append(self.unk_token)
+             i += 1
+
+         return tokens
+
+     def _convert_token_to_id(self, token: str) -> int:
+         """Convert a token to its ID in the vocabulary."""
+         return self.vocab.get(token, self.vocab[self.unk_token])
+
+     def _convert_id_to_token(self, index: int) -> str:
+         """Convert an ID to its token in the vocabulary."""
+         return self.ids_to_tokens.get(index, self.unk_token)
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         """Save the tokenizer vocabulary to a directory."""
+         if not filename_prefix:
+             filename_prefix = "vocab"
+
+         vocab_file = os.path.join(save_directory, f"{filename_prefix}.json")
+
+         with open(vocab_file, 'w', encoding='utf-8') as f:
+             json.dump(self.vocab, f, ensure_ascii=False, indent=2)
+
+         return (vocab_file,)
+
+     def save_pretrained(self, save_directory: str, legacy_format: bool = True, filename_prefix: Optional[str] = None, **kwargs):
+         """Save the tokenizer configuration and vocabulary."""
+         # Save the vocabulary
+         vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
+
+         # Save the config
+         config = {
+             'k': self.k,
+             'model_max_length': self.model_max_length,
+             'padding_side': self.padding_side,
+             'truncation_side': self.truncation_side,
+             'special_tokens': {
+                 'pad_token': self.pad_token,
+                 'unk_token': self.unk_token,
+                 'sep_token': self.sep_token,
+                 'cls_token': self.cls_token,
+                 'mask_token': self.mask_token,
+                 'bos_token': self.bos_token,
+                 'eos_token': self.eos_token,
+             }
+         }
+
+         super().save_pretrained(save_directory, config=config, legacy_format=legacy_format, **kwargs)
+
+         return vocab_files
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs):
+         """Load a tokenizer from a local directory or the Hugging Face Hub."""
+         from huggingface_hub import hf_hub_download
+
+         # Check if it's a local path or a Hugging Face Hub repo
+         if os.path.isdir(pretrained_model_name_or_path):
+             # Local directory
+             config_file = os.path.join(pretrained_model_name_or_path, "tokenizer_config.json")
+             vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
+         else:
+             # Hugging Face Hub
+             config_file = hf_hub_download(
+                 repo_id=pretrained_model_name_or_path,
+                 filename="tokenizer_config.json"
+             )
+             vocab_file = hf_hub_download(
+                 repo_id=pretrained_model_name_or_path,
+                 filename="vocab.json"
+             )
+
+         # Load config
+         with open(config_file, 'r', encoding='utf-8') as f:
+             config = json.load(f)
+
+         # Load vocab
+         with open(vocab_file, 'r', encoding='utf-8') as f:
+             vocab = json.load(f)
+
+         # Fall back to the default k: the shipped tokenizer_config.json does
+         # not store a 'k' key, and product(..., repeat=None) would fail.
+         k = config.get('k', 6)
+
+         # Create tokenizer instance - tokens are at the top level in tokenizer_config.json
+         tokenizer = cls(
+             k=k,
+             model_max_length=config.get('model_max_length', 2048),
+             pad_token=config.get('pad_token', '[PAD]'),
+             unk_token=config.get('unk_token', '[UNK]'),
+             sep_token=config.get('sep_token', '[SEP]'),
+             cls_token=config.get('cls_token', '[CLS]'),
+             mask_token=config.get('mask_token', '[MASK]'),
+             bos_token=config.get('bos_token', '[BOS]'),
+             eos_token=config.get('eos_token', '[EOS]'),
+             **kwargs
+         )
+
+         # Override the vocabulary with the saved one
+         tokenizer.vocab = vocab
+         tokenizer.ids_to_tokens = {idx: token for token, idx in vocab.items()}
+
+         return tokenizer
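
The k-mer scheme is greedy: it consumes non-overlapping k-mers left to right and falls back to single nucleotides (or `[UNK]`) for any remainder. A quick illustration of the expected splitting with the in-code vocabulary (a sketch; assumes tokenizer.py is importable from the working directory):

```python
from tokenizer import NucEL_Tokenizer

tok = NucEL_Tokenizer(k=6)
# 10 nt: one 6-mer, then four single-nucleotide fallback tokens.
print(tok.tokenize("ACGTACGTAC"))
# ['[CLS]', 'ACGTAC', 'G', 'T', 'A', 'C']
```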
tokenizer_config.json ADDED
@@ -0,0 +1,71 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "[BOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "[EOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "[BOS]",
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "eos_token": "[EOS]",
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_max_length": 1025,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "tokenizer_class": "NucEL_Tokenizer",
+   "unk_token": "[UNK]"
+ }
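
The `added_tokens_decoder` ids (0-6) line up with the first seven entries of vocab.json, so special-token handling stays consistent whether the tokenizer is rebuilt from code or loaded from these files. A quick consistency check (illustrative, run from a local clone of the repo):

```python
import json

with open("tokenizer_config.json") as f:
    tcfg = json.load(f)
with open("vocab.json") as f:
    vocab = json.load(f)

# Each declared special token should map to the same id in both files.
for idx, entry in tcfg["added_tokens_decoder"].items():
    assert vocab[entry["content"]] == int(idx)
```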
vocab.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "[PAD]": 0,
+   "[UNK]": 1,
+   "[CLS]": 2,
+   "[SEP]": 3,
+   "[MASK]": 4,
+   "[BOS]": 5,
+   "[EOS]": 6,
+   "A": 11,
+   "C": 12,
+   "G": 13,
+   "T": 14,
+   "[RESERVED_0]": 15,
+   "[RESERVED_1]": 16,
+   "[RESERVED_2]": 17,
+   "[RESERVED_3]": 18,
+   "[RESERVED_4]": 19,
+   "[RESERVED_5]": 20,
+   "[RESERVED_6]": 21,
+   "[RESERVED_7]": 22,
+   "[RESERVED_8]": 23,
+   "[RESERVED_9]": 24,
+   "[RESERVED_10]": 25,
+   "[RESERVED_11]": 26,
+   "[RESERVED_12]": 27,
+   "[RESERVED_13]": 28,
+   "[RESERVED_14]": 29,
+   "[RESERVED_15]": 30
+ }
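
This shipped vocabulary holds only special, single-nucleotide, and reserved tokens (27 entries, matching `vocab_size` in config.json); it contains no 6-mers, so once `from_pretrained` overrides the in-code vocabulary, the k-mer lookup never matches and sequences are encoded strictly per nucleotide. A round-trip sketch (assumes tokenizer.py is importable):

```python
from tokenizer import NucEL_Tokenizer

tokenizer = NucEL_Tokenizer.from_pretrained("FreakingPotato/RNAElectra")
ids = tokenizer("ACGT")["input_ids"]
print(ids)                                   # [2, 11, 12, 13, 14]
print(tokenizer.convert_ids_to_tokens(ids))  # ['[CLS]', 'A', 'C', 'G', 'T']
```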