AmirErez commited on
Commit
4b32750
·
verified ·
1 Parent(s): 697a6a2

Upload folder using huggingface_hub

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
README.md CHANGED
@@ -1,3 +1,57 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ tags:
4
+ - biology
5
+ - genomics
6
+ - dna-sequence
7
+ - bacterial-classification
8
+ - bert
9
+ - transformers
10
+ ---
11
+
12
+ # BBERT Pre-trained Models
13
+
14
+ Pre-trained models for [BBERT](https://github.com/AmirErez/BBERT) - BERT for Bacterial DNA Classification.
15
+
16
+ ## Models Included
17
+
18
+ ### 1. BBERT Transformer (`bbert_checkpoint-32500/`)
19
+ - Main BERT-based model trained on bacterial DNA sequences
20
+ - Hidden size: 768
21
+ - Trained on diverse bacterial genomes
22
+
23
+ ### 2. Bacterial Classifier (`bacterial_classifier/epoch_80.pt`)
24
+ - Binary classifier for bacterial vs. non-bacterial sequences
25
+ - Input: BBERT embeddings (768-dim)
26
+ - Trained for 80 epochs on 3.9M sequences
27
+
28
+ ### 3. Reading Frame Classifier (`frame_classifier/classifier_model_2000K_37e.pth`)
29
+ - 6-way classifier for reading frame prediction
30
+ - Frames: +1, +2, +3, -1, -2, -3
31
+ - Trained for 37 epochs on 2M sequences
32
+
33
+ ### 4. Coding Sequence Classifier (`coding_classifier/epoch_46.pt`)
34
+ - Binary classifier for coding vs. non-coding sequences
35
+ - Trained for 46 epochs on 3.9M sequences
36
+
37
+ ## Usage
38
+
39
+ These models are automatically downloaded when using BBERT:
40
+
41
+ ```bash
42
+ # First time setup
43
+ pip install bbert # or clone from GitHub
44
+ python source/download_models.py
45
+
46
+ # Then use normally
47
+ python bbert.py your_sequences.fasta --output_dir results
48
+ ```
49
+
50
+ ## Citation
51
+
52
+ If you use BBERT, please cite:
53
+ [Add your citation here]
54
+
55
+ ## License
56
+
57
+ MIT License - see LICENSE file for details
bacterial_classifier/epoch_80.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bd4cb77cd552e46ca02402510e679fcb9db30f06f5c713f466c04d092f2f140
3
+ size 1883532
bbert_checkpoint-32500/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/sci/home/alekhin_dm_81/projects/BBERTooD/models/diverse_bact_12_768_6_20000/checkpoint-28500",
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 1536,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 6,
18
+ "pad_token_id": 0,
19
+ "position_embedding_type": "absolute",
20
+ "torch_dtype": "float32",
21
+ "transformers_version": "4.30.2",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 12
25
+ }
bbert_checkpoint-32500/generation_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "pad_token_id": 0,
4
+ "transformers_version": "4.30.2"
5
+ }
bbert_checkpoint-32500/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3890ee18f7cf983c405984004989c03fa93fc63b005bf8887918a773a3ebe04
3
+ size 117480438
bbert_checkpoint-32500/special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<cls>",
4
+ "eos_token": "</s>",
5
+ "mask_token": "<msk>",
6
+ "pad_token": "<pad>",
7
+ "sep_token": "<sep>",
8
+ "unk_token": "<unk>"
9
+ }
bbert_checkpoint-32500/tokenizer.json ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 102,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": {
10
+ "strategy": {
11
+ "Fixed": 102
12
+ },
13
+ "direction": "Right",
14
+ "pad_to_multiple_of": null,
15
+ "pad_id": 3,
16
+ "pad_type_id": 0,
17
+ "pad_token": "<pad>"
18
+ },
19
+ "added_tokens": [
20
+ {
21
+ "id": 0,
22
+ "content": "<s>",
23
+ "single_word": false,
24
+ "lstrip": false,
25
+ "rstrip": false,
26
+ "normalized": false,
27
+ "special": true
28
+ },
29
+ {
30
+ "id": 1,
31
+ "content": "</s>",
32
+ "single_word": false,
33
+ "lstrip": false,
34
+ "rstrip": false,
35
+ "normalized": false,
36
+ "special": true
37
+ },
38
+ {
39
+ "id": 2,
40
+ "content": "<unk>",
41
+ "single_word": false,
42
+ "lstrip": false,
43
+ "rstrip": false,
44
+ "normalized": false,
45
+ "special": true
46
+ },
47
+ {
48
+ "id": 3,
49
+ "content": "<pad>",
50
+ "single_word": false,
51
+ "lstrip": false,
52
+ "rstrip": false,
53
+ "normalized": false,
54
+ "special": true
55
+ },
56
+ {
57
+ "id": 4,
58
+ "content": "<cls>",
59
+ "single_word": false,
60
+ "lstrip": false,
61
+ "rstrip": false,
62
+ "normalized": false,
63
+ "special": true
64
+ },
65
+ {
66
+ "id": 5,
67
+ "content": "<sep>",
68
+ "single_word": false,
69
+ "lstrip": false,
70
+ "rstrip": false,
71
+ "normalized": false,
72
+ "special": true
73
+ },
74
+ {
75
+ "id": 6,
76
+ "content": "<msk>",
77
+ "single_word": false,
78
+ "lstrip": false,
79
+ "rstrip": false,
80
+ "normalized": false,
81
+ "special": true
82
+ },
83
+ {
84
+ "id": 7,
85
+ "content": "T",
86
+ "single_word": false,
87
+ "lstrip": false,
88
+ "rstrip": false,
89
+ "normalized": true,
90
+ "special": false
91
+ },
92
+ {
93
+ "id": 8,
94
+ "content": "C",
95
+ "single_word": false,
96
+ "lstrip": false,
97
+ "rstrip": false,
98
+ "normalized": true,
99
+ "special": false
100
+ },
101
+ {
102
+ "id": 9,
103
+ "content": "A",
104
+ "single_word": false,
105
+ "lstrip": false,
106
+ "rstrip": false,
107
+ "normalized": true,
108
+ "special": false
109
+ },
110
+ {
111
+ "id": 10,
112
+ "content": "G",
113
+ "single_word": false,
114
+ "lstrip": false,
115
+ "rstrip": false,
116
+ "normalized": true,
117
+ "special": false
118
+ },
119
+ {
120
+ "id": 11,
121
+ "content": "N",
122
+ "single_word": false,
123
+ "lstrip": false,
124
+ "rstrip": false,
125
+ "normalized": true,
126
+ "special": false
127
+ }
128
+ ],
129
+ "normalizer": null,
130
+ "pre_tokenizer": {
131
+ "type": "Whitespace"
132
+ },
133
+ "post_processor": {
134
+ "type": "TemplateProcessing",
135
+ "single": [
136
+ {
137
+ "SpecialToken": {
138
+ "id": "<cls>",
139
+ "type_id": 0
140
+ }
141
+ },
142
+ {
143
+ "Sequence": {
144
+ "id": "A",
145
+ "type_id": 0
146
+ }
147
+ },
148
+ {
149
+ "SpecialToken": {
150
+ "id": "<sep>",
151
+ "type_id": 0
152
+ }
153
+ }
154
+ ],
155
+ "pair": [
156
+ {
157
+ "SpecialToken": {
158
+ "id": "<cls>",
159
+ "type_id": 0
160
+ }
161
+ },
162
+ {
163
+ "Sequence": {
164
+ "id": "A",
165
+ "type_id": 0
166
+ }
167
+ },
168
+ {
169
+ "SpecialToken": {
170
+ "id": "<sep>",
171
+ "type_id": 0
172
+ }
173
+ },
174
+ {
175
+ "Sequence": {
176
+ "id": "B",
177
+ "type_id": 1
178
+ }
179
+ },
180
+ {
181
+ "SpecialToken": {
182
+ "id": "<sep>",
183
+ "type_id": 1
184
+ }
185
+ }
186
+ ],
187
+ "special_tokens": {
188
+ "<cls>": {
189
+ "id": "<cls>",
190
+ "ids": [
191
+ 4
192
+ ],
193
+ "tokens": [
194
+ "<cls>"
195
+ ]
196
+ },
197
+ "<sep>": {
198
+ "id": "<sep>",
199
+ "ids": [
200
+ 5
201
+ ],
202
+ "tokens": [
203
+ "<sep>"
204
+ ]
205
+ }
206
+ }
207
+ },
208
+ "decoder": null,
209
+ "model": {
210
+ "type": "WordLevel",
211
+ "vocab": {},
212
+ "unk_token": "<unk>"
213
+ }
214
+ }
bbert_checkpoint-32500/tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "clean_up_tokenization_spaces": true,
4
+ "cls_token": "<cls>",
5
+ "eos_token": "</s>",
6
+ "mask_token": "<msk>",
7
+ "model_max_length": 512,
8
+ "pad_token": "<pad>",
9
+ "padding_side": "right",
10
+ "sep_token": "<sep>",
11
+ "tokenizer_class": "PreTrainedTokenizerFast",
12
+ "unk_token": "<unk>"
13
+ }
bbert_checkpoint-32500/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
bbert_checkpoint-32500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9cbf5dca0a2e1254f310a48edbead0bcf0c8e2e720a9143a0f426ecc9f53a88
3
+ size 4536
coding_classifier/epoch_46.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9219e170516b12c2fca76818fee12782dc22a8623e5e8ba533fc6ee3cfc9c95
3
+ size 1883532
frame_classifier/classifier_model_2000K_37e.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83337c01f258bea94292daf0906290a3a63e1b220da7895b6f06bd9482708b43
3
+ size 5643940