Upload 6 files

- config.json +52 -0
- omnigenome_wrapper.py +108 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- tokenizer_config.json +1 -0
- vocab.txt +69 -0
config.json
ADDED

@@ -0,0 +1,52 @@
+{
+  "architectures": [
+    "BertForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "do_sample": false,
+  "eos_token_ids": 0,
+  "finetuning_task": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "is_decoder": false,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1
+  },
+  "layer_norm_eps": 1e-12,
+  "length_penalty": 1.0,
+  "max_length": 10,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_beams": 1,
+  "num_hidden_layers": 12,
+  "num_labels": 2,
+  "num_return_sequences": 1,
+  "num_rnn_layer": 1,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "output_past": true,
+  "pad_token_id": 0,
+  "pruned_heads": {},
+  "repetition_penalty": 1.0,
+  "rnn": "lstm",
+  "rnn_dropout": 0.0,
+  "rnn_hidden": 768,
+  "split": 10,
+  "temperature": 1.0,
+  "top_k": 50,
+  "top_p": 1.0,
+  "torchscript": false,
+  "type_vocab_size": 2,
+  "use_bfloat16": false,
+  "vocab_size": 69
+}
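Note: beyond the standard BERT fields, this config carries custom keys (rnn, num_rnn_layer, rnn_hidden, rnn_dropout, split) that plain BertForMaskedLM ignores; Hugging Face keeps unknown config keys as plain attributes. A minimal loading sketch, where "./" is just a placeholder for a local clone of this repo:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("./")    # reads this config.json
print(config.model_type, config.vocab_size)  # bert 69
print(config.rnn, config.rnn_hidden)         # custom keys survive as attributes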
omnigenome_wrapper.py
ADDED

@@ -0,0 +1,108 @@
+# -*- coding: utf-8 -*-
+# file: omnigenbench_wrapper.py
+# time: 00:57 27/04/2024
+# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
+# github: https://github.com/yangheng95
+# huggingface: https://huggingface.co/yangheng
+# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
+# Copyright (C) 2019-2024. All Rights Reserved.
+
+import warnings
+
+from transformers import AutoTokenizer
+
+from omnigenbench import OmniKmersTokenizer
+
+
+class Tokenizer(OmniKmersTokenizer):
+    def __init__(
+        self, base_tokenizer=None, k=3, overlap=0, max_length=512, t2u=True, **kwargs
+    ):
+        super(Tokenizer, self).__init__(base_tokenizer, t2u=t2u, **kwargs)
+        self.k = k
+        self.overlap = overlap
+        self.max_length = max_length
+        self.metadata["tokenizer_name"] = self.__class__.__name__
+
+    def __call__(self, sequence, **kwargs):
+        if self.u2t:
+            sequence = "".join([seq.replace("U", "T").upper() for seq in sequence])
+        if self.t2u:
+            sequence = "".join([seq.replace("T", "U").upper() for seq in sequence])
+
+        max_length = kwargs.get("max_length", self.max_length)
+        # Truncate each sequence's k-mers, leaving room for the bos/eos tokens.
+        sequence_tokens = [t[: max_length - 2] for t in self.tokenize(sequence)]
+        tokenized_inputs = {
+            "input_ids": [],
+            "attention_mask": [],
+        }
+        bos_id = (
+            self.base_tokenizer.bos_token_id
+            if self.base_tokenizer.bos_token_id is not None
+            else self.base_tokenizer.cls_token_id
+        )
+        eos_id = (
+            self.base_tokenizer.eos_token_id
+            if self.base_tokenizer.eos_token_id is not None
+            else self.base_tokenizer.sep_token_id
+        )
+
+        for tokens in sequence_tokens:
+            tokenized_inputs["input_ids"].append(
+                [bos_id] + self.base_tokenizer.convert_tokens_to_ids(tokens) + [eos_id]
+            )
+            tokenized_inputs["attention_mask"].append(
+                [1] * len(tokenized_inputs["input_ids"][-1])
+            )
+
+        for i, ids in enumerate(tokenized_inputs["input_ids"]):
+            if ids.count(self.base_tokenizer.unk_token_id) / len(ids) > 0.1:
+                warnings.warn(
+                    f"Unknown tokens are more than 10% in the {i}th sequence, please check the tokenization process."
+                )
+        tokenized_inputs = self.base_tokenizer.pad(
+            tokenized_inputs,
+            padding="max_length",
+            max_length=self.max_length
+            if not kwargs.get("max_length", None)
+            else kwargs.get("max_length"),
+            pad_to_multiple_of=self.max_length
+            if not kwargs.get("max_length", None)
+            else kwargs.get("max_length"),
+            return_attention_mask=True,
+            return_tensors="pt",
+        )
+        return tokenized_inputs
+
+    @staticmethod
+    def from_pretrained(model_name_or_path, **kwargs):
+        self = Tokenizer(
+            AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
+        )
+        return self
+
+    def tokenize(self, sequence, **kwargs):
+        if isinstance(sequence, str):
+            sequences = [sequence]
+        else:
+            sequences = sequence
+
+        sequence_tokens = []
+        for i in range(len(sequences)):
+            tokens = []
+            for j in range(0, len(sequences[i]), self.k - self.overlap):
+                tokens.append(sequences[i][j : j + self.k])
+
+            sequence_tokens.append(tokens)
+
+        return sequence_tokens
+
+    def encode(self, input_ids, **kwargs):
+        return self.base_tokenizer.encode(input_ids, **kwargs)
+
+    def decode(self, input_ids, **kwargs):
+        return self.base_tokenizer.decode(input_ids, **kwargs)
+
+    def encode_plus(self, sequence, **kwargs):
+        raise NotImplementedError("The encode_plus() function is not implemented yet.")
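For orientation, a minimal usage sketch of the wrapper above. Assumptions not taken from this repo: the OmniKmersTokenizer base class stores base_tokenizer plus the u2t/t2u flags referenced in __call__, and "./" stands for a local clone containing vocab.txt.

from transformers import AutoTokenizer

from omnigenome_wrapper import Tokenizer

base = AutoTokenizer.from_pretrained("./")  # a BertTokenizer over vocab.txt
tokenizer = Tokenizer(base, k=3, overlap=0, max_length=512)

# With t2u=True, T is rewritten to U, so DNA-style input maps onto the
# RNA 3-mer vocabulary: "ATGGCTTAA" -> AUG / GCU / UAA.
inputs = tokenizer("ATGGCTTAA")
print(inputs["input_ids"].shape)  # torch.Size([1, 512]) after padding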
pytorch_model.bin
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7aca71823ab74771006be1030d9e7239220bba40a16858575929a02e6d2a7471
+size 346827305
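This entry is a Git LFS pointer rather than the checkpoint itself: the oid and size lines identify a roughly 347 MB pytorch_model.bin that a plain clone does not download. Running git lfs install once, then git lfs pull inside the clone, replaces the pointer with the actual weights.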
special_tokens_map.json
ADDED

@@ -0,0 +1 @@
+{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer_config.json
ADDED

@@ -0,0 +1 @@
+{"do_lower_case": false, "max_len": 512}
vocab.txt
ADDED

@@ -0,0 +1,69 @@
+[PAD]
+[UNK]
+[CLS]
+[SEP]
+[MASK]
+AAA
+AAU
+AAC
+AAG
+AUA
+AUU
+AUC
+AUG
+ACA
+ACU
+ACC
+ACG
+AGA
+AGU
+AGC
+AGG
+UAA
+UAU
+UAC
+UAG
+UUA
+UUU
+UUC
+UUG
+UCA
+UCU
+UCC
+UCG
+UGA
+UGU
+UGC
+UGG
+CAA
+CAU
+CAC
+CAG
+CUA
+CUU
+CUC
+CUG
+CCA
+CCU
+CCC
+CCG
+CGA
+CGU
+CGC
+CGG
+GAA
+GAU
+GAC
+GAG
+GUA
+GUU
+GUC
+GUG
+GCA
+GCU
+GCC
+GCG
+GGA
+GGU
+GGC
+GGG
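The 69 entries above are the 5 special tokens followed by all 64 RNA 3-mers over the alphabet A, U, C, G. A small sketch (illustrative only, not part of the repo) that regenerates the list in the same order:

from itertools import product

SPECIALS = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
BASES = "AUCG"  # base order matching vocab.txt: AAA, AAU, AAC, AAG, ...
VOCAB = SPECIALS + ["".join(kmer) for kmer in product(BASES, repeat=3)]
assert len(VOCAB) == 69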