Upload folder using huggingface_hub

Browse files

Files changed (13) hide show

.DS_Store +0 -0
README.md +57 -3
bacterial_classifier/epoch_80.pt +3 -0
bbert_checkpoint-32500/config.json +25 -0
bbert_checkpoint-32500/generation_config.json +5 -0
bbert_checkpoint-32500/pytorch_model.bin +3 -0
bbert_checkpoint-32500/special_tokens_map.json +9 -0
bbert_checkpoint-32500/tokenizer.json +214 -0
bbert_checkpoint-32500/tokenizer_config.json +13 -0
bbert_checkpoint-32500/trainer_state.json +0 -0
bbert_checkpoint-32500/training_args.bin +3 -0
coding_classifier/epoch_46.pt +3 -0
frame_classifier/classifier_model_2000K_37e.pth +3 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

README.md CHANGED Viewed

@@ -1,3 +1,57 @@
----
-license: mit
----

+---
+license: mit
+tags:
+- biology
+- genomics
+- dna-sequence
+- bacterial-classification
+- bert
+- transformers
+---
+# BBERT Pre-trained Models
+Pre-trained models for [BBERT](https://github.com/AmirErez/BBERT) - BERT for Bacterial DNA Classification.
+## Models Included
+### 1. BBERT Transformer (`bbert_checkpoint-32500/`)
+- Main BERT-based model trained on bacterial DNA sequences
+- Hidden size: 768
+- Trained on diverse bacterial genomes
+### 2. Bacterial Classifier (`bacterial_classifier/epoch_80.pt`)
+- Binary classifier for bacterial vs. non-bacterial sequences
+- Input: BBERT embeddings (768-dim)
+- Trained for 80 epochs on 3.9M sequences
+### 3. Reading Frame Classifier (`frame_classifier/classifier_model_2000K_37e.pth`)
+- 6-way classifier for reading frame prediction
+- Frames: +1, +2, +3, -1, -2, -3
+- Trained for 37 epochs on 2M sequences
+### 4. Coding Sequence Classifier (`coding_classifier/epoch_46.pt`)
+- Binary classifier for coding vs. non-coding sequences
+- Trained for 46 epochs on 3.9M sequences
+## Usage
+These models are automatically downloaded when using BBERT:
+```bash
+# First time setup
+pip install bbert  # or clone from GitHub
+python source/download_models.py
+# Then use normally
+python bbert.py your_sequences.fasta --output_dir results
+```
+## Citation
+If you use BBERT, please cite:
+[Add your citation here]
+## License
+MIT License - see LICENSE file for details

bacterial_classifier/epoch_80.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8bd4cb77cd552e46ca02402510e679fcb9db30f06f5c713f466c04d092f2f140
+size 1883532

bbert_checkpoint-32500/config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "_name_or_path": "/sci/home/alekhin_dm_81/projects/BBERTooD/models/diverse_bact_12_768_6_20000/checkpoint-28500",
+  "architectures": [
+    "BertForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.30.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 12
+}

bbert_checkpoint-32500/generation_config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "_from_model_config": true,
+  "pad_token_id": 0,
+  "transformers_version": "4.30.2"
+}

bbert_checkpoint-32500/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f3890ee18f7cf983c405984004989c03fa93fc63b005bf8887918a773a3ebe04
+size 117480438

bbert_checkpoint-32500/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "bos_token": "<s>",
+  "cls_token": "<cls>",
+  "eos_token": "</s>",
+  "mask_token": "<msk>",
+  "pad_token": "<pad>",
+  "sep_token": "<sep>",
+  "unk_token": "<unk>"
+}

bbert_checkpoint-32500/tokenizer.json ADDED Viewed

	@@ -0,0 +1,214 @@

+{
+  "version": "1.0",
+  "truncation": {
+    "direction": "Right",
+    "max_length": 102,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
+  "padding": {
+    "strategy": {
+      "Fixed": 102
+    },
+    "direction": "Right",
+    "pad_to_multiple_of": null,
+    "pad_id": 3,
+    "pad_type_id": 0,
+    "pad_token": "<pad>"
+  },
+  "added_tokens": [
+    {
+      "id": 0,
+      "content": "<s>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 1,
+      "content": "</s>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 2,
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 3,
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 4,
+      "content": "<cls>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 5,
+      "content": "<sep>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 6,
+      "content": "<msk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 7,
+      "content": "T",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": true,
+      "special": false
+    },
+    {
+      "id": 8,
+      "content": "C",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": true,
+      "special": false
+    },
+    {
+      "id": 9,
+      "content": "A",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": true,
+      "special": false
+    },
+    {
+      "id": 10,
+      "content": "G",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": true,
+      "special": false
+    },
+    {
+      "id": 11,
+      "content": "N",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": true,
+      "special": false
+    }
+  ],
+  "normalizer": null,
+  "pre_tokenizer": {
+    "type": "Whitespace"
+  },
+  "post_processor": {
+    "type": "TemplateProcessing",
+    "single": [
+      {
+        "SpecialToken": {
+          "id": "<cls>",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "<sep>",
+          "type_id": 0
+        }
+      }
+    ],
+    "pair": [
+      {
+        "SpecialToken": {
+          "id": "<cls>",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "<sep>",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "B",
+          "type_id": 1
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "<sep>",
+          "type_id": 1
+        }
+      }
+    ],
+    "special_tokens": {
+      "<cls>": {
+        "id": "<cls>",
+        "ids": [
+          4
+        ],
+        "tokens": [
+          "<cls>"
+        ]
+      },
+      "<sep>": {
+        "id": "<sep>",
+        "ids": [
+          5
+        ],
+        "tokens": [
+          "<sep>"
+        ]
+      }
+    }
+  },
+  "decoder": null,
+  "model": {
+    "type": "WordLevel",
+    "vocab": {},
+    "unk_token": "<unk>"
+  }
+}

bbert_checkpoint-32500/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<cls>",
+  "eos_token": "</s>",
+  "mask_token": "<msk>",
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "sep_token": "<sep>",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "<unk>"
+}

bbert_checkpoint-32500/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

bbert_checkpoint-32500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c9cbf5dca0a2e1254f310a48edbead0bcf0c8e2e720a9143a0f426ecc9f53a88
+size 4536

coding_classifier/epoch_46.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e9219e170516b12c2fca76818fee12782dc22a8623e5e8ba533fc6ee3cfc9c95
+size 1883532

frame_classifier/classifier_model_2000K_37e.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:83337c01f258bea94292daf0906290a3a63e1b220da7895b6f06bd9482708b43
+size 5643940