Instructions to use Taykhoom/SpliceBERT-1024nt with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Taykhoom/SpliceBERT-1024nt with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("fill-mask", model="Taykhoom/SpliceBERT-1024nt", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained("Taykhoom/SpliceBERT-1024nt", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
File size: 3,044 Bytes
fe65700 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 | import json
import os
from transformers import PreTrainedTokenizer
# Fallback token -> id table used when no vocab file is loaded.
# Ids are assigned by position: the five special tokens come first,
# followed by the nucleotide symbols N, A, C, G, T.
_DEFAULT_VOCAB = {
    token: index
    for index, token in enumerate(
        (
            "[PAD]",
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "[MASK]",
            "N",
            "A",
            "C",
            "G",
            "T",
        )
    )
}
class SpliceBERTTokenizer(PreTrainedTokenizer):
    """Single-nucleotide tokenizer for SpliceBERT.

    Each character of the input sequence becomes one token. Input is
    upper-cased, ``U`` is converted to ``T`` and spaces are removed, so raw
    sequences (not pre-spaced) are accepted. Encoded sequences are wrapped as
    ``[CLS] seq [SEP]``; pairs as ``[CLS] a [SEP] [CLS] b [SEP]``.
    """

    vocab_files_names = {"vocab_file": "vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        cls_token="[CLS]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        mask_token="[MASK]",
        unk_token="[UNK]",
        **kwargs,
    ):
        """Build the tokenizer.

        Args:
            vocab_file: Optional path to a JSON file mapping token -> id.
                When it is falsy or does not point to an existing file, the
                built-in ``_DEFAULT_VOCAB`` is used instead.
            cls_token, sep_token, pad_token, mask_token, unk_token: Special
                token strings forwarded to the base class.
            **kwargs: Extra keyword arguments forwarded to the base class.
        """
        self._vocab = dict(_DEFAULT_VOCAB)
        if vocab_file and os.path.isfile(vocab_file):
            # JSON is UTF-8 by specification; do not rely on the locale default.
            with open(vocab_file, encoding="utf-8") as f:
                self._vocab = json.load(f)
        self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
        # The lookup tables are set up before the base initializer runs;
        # presumably it queries the vocabulary during setup - keep this order.
        super().__init__(
            cls_token=cls_token,
            sep_token=sep_token,
            pad_token=pad_token,
            mask_token=mask_token,
            unk_token=unk_token,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Number of entries in the token -> id table."""
        return len(self._vocab)

    def get_vocab(self) -> dict:
        """Return a copy of the token -> id table."""
        return dict(self._vocab)

    def _tokenize(self, text: str) -> list:
        """Split ``text`` into single-character tokens.

        The text is upper-cased, ``U`` is replaced by ``T`` and spaces are
        dropped before splitting.
        """
        return list(text.upper().replace("U", "T").replace(" ", ""))

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token to its id, falling back to the ``[UNK]`` id."""
        return self._vocab.get(token, self._vocab["[UNK]"])

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id to its token, falling back to ``"[UNK]"``."""
        return self._ids_to_tokens.get(index, "[UNK]")

    def save_vocabulary(self, save_directory, filename_prefix=None) -> tuple:
        """Write the vocabulary as JSON into ``save_directory``.

        Args:
            save_directory: Target directory; created if it does not exist.
            filename_prefix: Optional prefix; the file is named
                ``<prefix>-vocab.json`` when given, else ``vocab.json``.

        Returns:
            A 1-tuple containing the path of the written file.
        """
        os.makedirs(save_directory, exist_ok=True)
        fname = (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
        path = os.path.join(save_directory, fname)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, indent=2)
        return (path,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Wrap id sequences with special tokens.

        Returns ``[CLS] ids_0 [SEP]`` for a single sequence and
        ``[CLS] ids_0 [SEP] [CLS] ids_1 [SEP]`` for a pair.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + cls + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None,
                                already_has_special_tokens=False):
        """Return a mask with 1 at special-token positions and 0 elsewhere.

        When ``already_has_special_tokens`` is true the base-class
        implementation is used; otherwise the mask mirrors the layout
        produced by ``build_inputs_with_special_tokens``.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0, token_ids_1, already_has_special_tokens=True
            )
        mask = [1] + [0] * len(token_ids_0) + [1]
        if token_ids_1 is not None:
            mask += [1] + [0] * len(token_ids_1) + [1]
        return mask

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """Return all-zero token type ids matching the built input length.

        The result has one entry per position of the sequence produced by
        ``build_inputs_with_special_tokens``: ``len(ids_0) + 2`` for a single
        sequence and ``len(ids_0) + len(ids_1) + 4`` for a pair. Every
        position gets segment id 0.
        """
        # Emit segment ids, not the token ids themselves: only the lengths of
        # the inputs matter here.
        if token_ids_1 is None:
            return [0] * (len(token_ids_0) + 2)
        return [0] * (len(token_ids_0) + len(token_ids_1) + 4)
|