Uploaded model and tokenizer files

Browse files

Files changed (6) hide show

README.md +98 -5
config.json +32 -0
pytorch_model.bin +3 -0
special_tokens_map.json +7 -0
tokenizer.json +0 -0
tokenizer_config.json +11 -0

README.md CHANGED Viewed

@@ -1,5 +1,98 @@
----
-license: apache-2.0
-base_model:
-- zhihan1996/DNABERT-2-117M
----

+---
+license: apache-2.0
+base_model:
+  - zhihan1996/DNABERT-2-117M
+tags:
+  - biology
+  - medical
+---
+This is the SNL model, a fine-tuned version of
+[zhihan1996/DNABERT-2-117M](https://huggingface.co/zhihan1996/DNABERT-2-117M).
+The SNL model predicts RNA off-targets induced by cytosine base editors (CBEs).
+Here is an example of using the model for RNA off-target prediction.
+**pred_rna_offtarget.py:**
+```python
+import sys
+import numpy as np
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+__authors__ = ["Kazuki Nakamae"]
+__version__ = "1.0.0"
+def pred_rna_offtarget(dna, model_dir):
+    try:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
+        model = AutoModelForSequenceClassification.from_pretrained(model_dir, trust_remote_code=True).to(device)
+    except Exception as e:
+        print(f"Error loading model from {model_dir}: {e}")
+        sys.exit(1)
+    inputs = tokenizer(dna, return_tensors='pt')
+    model.eval()
+    with torch.no_grad():
+      outputs = model(
+          inputs["input_ids"].to(device),
+          inputs["attention_mask"].to(device),
+        )
+    print("[Negative, Positive]")
+    print(outputs.logits)
+    y_preds = np.argmax(outputs.logits.to('cpu').detach().numpy().copy(), axis=1)
+    def id2label(x):
+        return model.config.id2label[x]
+    y_dash = [id2label(x) for x in y_preds]
+    print("Result:")
+    print(y_dash)
+    # LABEL_0: Not RNA-offtarget / LABEL_1: RNA-offtarget
+    return (dna, y_dash)
+def print_usage():
+    print(f"Usage: {sys.argv[0]} <input DNA sequence> <DNABERT-2 model directory>")
+    print("Options:")
+    print("  -h, --help    Show this help message and exit")
+    print("  -v, --version Show version information and exit")
+def print_version():
+    print(f"{sys.argv[0]} version {__version__}")
+    print("Authors:", ", ".join(__authors__))
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        if len(sys.argv) == 2 and sys.argv[1] in ("-h", "--help"):
+            print_usage()
+            sys.exit(0)
+        elif len(sys.argv) == 2 and sys.argv[1] in ("-v", "--version"):
+            print_version()
+            sys.exit(0)
+        else:
+            print_usage()
+            sys.exit(1)
+    dna = sys.argv[1]
+    model_dir = sys.argv[2]
+    pred_rna_offtarget(dna, model_dir)
+```
+```bash
+$ python pred_rna_offtarget.py GGCAGGGCTGGGGAAGCTTACTGTGTCCAAGAGCCTGCTG KazukiNakamae/SNLmodel;
+[Negative, Positive]
+tensor([[-0.7521,  0.4817]])
+Result:
+['LABEL_1']
+$ python pred_rna_offtarget.py GTCATCTAACAAAAATATTCCGTTGCAGGAAAAGCAAGCT KazukiNakamae/SNLmodel;
+[Negative, Positive]
+tensor([[ 0.9211, -0.8157]])
+Result:
+['LABEL_0']
+```
+#### Developers of the fine-tuned model
+- [Takayuki Suzuki](https://github.com/szktkyk)
+- [Kazuki Nakamae](https://github.com/KazukiNakamae)

config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "_name_or_path": "tmp/DNABERT-2-CBE_Suzuki_Nakamae_v1/",
+  "alibi_starting_size": 512,
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.0,
+  "auto_map": {
+    "AutoConfig": "zhihan1996/DNABERT-2-117M--configuration_bert.BertConfig",
+    "AutoModel": "zhihan1996/DNABERT-2-117M--bert_layers.BertModel",
+    "AutoModelForMaskedLM": "zhihan1996/DNABERT-2-117M--bert_layers.BertForMaskedLM",
+    "AutoModelForSequenceClassification": "zhihan1996/DNABERT-2-117M--bert_layers.BertForSequenceClassification"
+  },
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.29.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 4096
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2632dda98f60a768ef26c5932c48a650fa7f132b342153c759d9d7040c7bdda5
+size 468326010

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "model_max_length": 10,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "sep_token": "[SEP]",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "[UNK]"
+}