yangheng committed on
Commit 62ab4cc · verified · 1 Parent(s): d930506

Upload 9 files

__init__.py ADDED
@@ -0,0 +1,8 @@
+ # -*- coding: utf-8 -*-
+ # file: __init__.py
+ # time: 13:48 04/06/2024
+ # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
+ # github: https://github.com/yangheng95
+ # huggingface: https://huggingface.co/yangheng
+ # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
+ # Copyright (C) 2019-2024. All Rights Reserved.
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "_name_or_path": "./mlm_on_tx.510nt",
+   "architectures": [
+     "BertForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 512,
+   "initializer_range": 0.02,
+   "intermediate_size": 2048,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 6,
+   "output_hidden_states": true,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.20.1",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 10
+ }
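
config.json declares a stock BertForMaskedLM (6 layers, hidden size 512, 16 heads, and a 10-entry nucleotide-level vocabulary), so the checkpoint should load with plain transformers. A minimal loading sketch, assuming the repository files have been downloaded to the current directory (this commit does not show the repo id, so a local path is used):

    from transformers import AutoModelForMaskedLM

    model = AutoModelForMaskedLM.from_pretrained(".")  # reads config.json + pytorch_model.bin
    print(model.config.hidden_size)   # 512
    print(model.config.vocab_size)    # 10 -- single-nucleotide vocabulary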
model_wrapper.py ADDED
@@ -0,0 +1,8 @@
+ # -*- coding: utf-8 -*-
+ # file: model_wrapper.py
+ # time: 01:00 27/04/2024
+ # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
+ # github: https://github.com/yangheng95
+ # huggingface: https://huggingface.co/yangheng
+ # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
+ # Copyright (C) 2019-2024. All Rights Reserved.
omnigenome_wrapper.py ADDED
@@ -0,0 +1,91 @@
+ # -*- coding: utf-8 -*-
+ # file: omnigenome_wrapper.py
+ # time: 00:57 27/04/2024
+ # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
+ # github: https://github.com/yangheng95
+ # huggingface: https://huggingface.co/yangheng
+ # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
+ # Copyright (C) 2019-2024. All Rights Reserved.
+
+ import warnings
+
+ from omnigenbench import OmniTokenizer
+
+
+ class Tokenizer(OmniTokenizer):
+     def __init__(self, base_tokenizer=None, u2t=True, add_whitespace=False, **kwargs):
+         super(Tokenizer, self).__init__(
+             base_tokenizer, u2t=u2t, add_whitespace=add_whitespace, **kwargs
+         )
+         self.metadata["tokenizer_name"] = self.__class__.__name__
+
+     def __call__(self, sequence, **kwargs):
+         # Normalise the input to a list of sequences so that a single string
+         # and a batch of strings are handled identically.
+         sequences = [sequence] if isinstance(sequence, str) else list(sequence)
+         # u2t/t2u (RNA<->DNA alphabet mapping) and add_whitespace are flags
+         # managed by the OmniTokenizer base class.
+         if self.u2t:
+             sequences = [seq.replace("U", "T").upper() for seq in sequences]
+         if self.t2u:
+             sequences = [seq.replace("T", "U").upper() for seq in sequences]
+         if self.add_whitespace:
+             sequences = [" ".join(seq) for seq in sequences]
+         # Truncate each token list (not the list of sequences), reserving two
+         # positions for the BOS/CLS and EOS/SEP tokens added below.
+         max_tokens = kwargs.get("max_length", self.max_length) - 2
+         sequence_tokens = [tokens[:max_tokens] for tokens in self.tokenize(sequences)]
+         tokenized_inputs = {
+             "input_ids": [],
+             "attention_mask": [],
+         }
+         # Fall back to [CLS]/[SEP] when the base tokenizer defines no BOS/EOS.
+         bos_id = (
+             self.base_tokenizer.bos_token_id
+             if self.base_tokenizer.bos_token_id is not None
+             else self.base_tokenizer.cls_token_id
+         )
+         eos_id = (
+             self.base_tokenizer.eos_token_id
+             if self.base_tokenizer.eos_token_id is not None
+             else self.base_tokenizer.sep_token_id
+         )
+         for tokens in sequence_tokens:
+             tokenized_inputs["input_ids"].append(
+                 [bos_id] + self.base_tokenizer.convert_tokens_to_ids(tokens) + [eos_id]
+             )
+             tokenized_inputs["attention_mask"].append(
+                 [1] * len(tokenized_inputs["input_ids"][-1])
+             )
+
+         # Warn when more than 10% of a sequence maps to the unknown token.
+         for i, ids in enumerate(tokenized_inputs["input_ids"]):
+             unk_ratio = ids.count(self.base_tokenizer.unk_token_id) / len(ids)
+             if unk_ratio > 0.1:
+                 warnings.warn(
+                     f"Unknown tokens make up {unk_ratio:.2%} of the {i}-th sequence; "
+                     f"please check the tokenization process."
+                 )
+         max_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
+         tokenized_inputs = self.base_tokenizer.pad(
+             tokenized_inputs,
+             padding=kwargs.get("padding", "max_length"),
+             max_length=min(max_length, kwargs.get("max_length", 512)),
+             return_attention_mask=kwargs.get("return_attention_mask", True),
+             return_tensors="pt",
+         )
+         return tokenized_inputs
+
+     def tokenize(self, sequence, **kwargs):
+         # Character-level tokenization: every nucleotide becomes one token.
+         sequences = [sequence] if isinstance(sequence, str) else sequence
+         return [list(seq) for seq in sequences]
+
+     def encode(self, sequence, **kwargs):
+         return self.base_tokenizer.encode(sequence, **kwargs)
+
+     def decode(self, sequence, **kwargs):
+         return self.base_tokenizer.decode(sequence, **kwargs)
+
+     def encode_plus(self, sequence, **kwargs):
+         return self.base_tokenizer.encode_plus(sequence, **kwargs)
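
A hypothetical usage sketch for the Tokenizer wrapper above. It assumes the wrapper is built on the BertTokenizer defined by tokenizer_config.json and that the OmniTokenizer base class leaves t2u and add_whitespace off by default; the expected ids follow the vocabulary in tokenizer.json below:

    from transformers import AutoTokenizer
    from omnigenome_wrapper import Tokenizer  # the wrapper defined above

    base = AutoTokenizer.from_pretrained(".")  # BertTokenizer per tokenizer_config.json
    tok = Tokenizer(base, u2t=True)            # u2t maps RNA "U" to DNA "T"
    batch = tok("AUGGCC", max_length=16)
    print(batch["input_ids"])  # expected: tensor([[2, 6, 9, 8, 8, 7, 7, 3]])

Because tokenize() splits sequences into single characters and __call__ converts them with convert_tokens_to_ids directly, the wrapper does not depend on the WordPiece matching rules of the underlying tokenizer.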
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b3a48c768a029dc6291cde8085c0bc02d40a6d03007b0535e6788573e2ea7808
+ size 77828395
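
The LFS pointer records only the payload size; a rough back-of-the-envelope check that 77828395 bytes is consistent with the float32 config above (the estimate ignores biases and LayerNorm weights):

    # Rough float32 parameter/size estimate from the config.json values.
    hidden, inter, layers, vocab, max_pos = 512, 2048, 6, 10, 512
    embeddings = vocab * hidden + max_pos * hidden + 2 * hidden
    per_layer = 4 * hidden * hidden + 2 * hidden * inter  # attention + FFN
    params = embeddings + layers * per_layer              # ~19.4M parameters
    print(params * 4)  # ~77.6 MB, close to the 77828395-byte payload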
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
@@ -0,0 +1,114 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [],
+   "normalizer": {
+     "type": "BertNormalizer",
+     "clean_text": true,
+     "handle_chinese_chars": true,
+     "strip_accents": null,
+     "lowercase": false
+   },
+   "pre_tokenizer": {
+     "type": "BertPreTokenizer"
+   },
+   "post_processor": {
+     "type": "TemplateProcessing",
+     "single": [
+       {
+         "SpecialToken": {
+           "id": "[CLS]",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "[SEP]",
+           "type_id": 0
+         }
+       }
+     ],
+     "pair": [
+       {
+         "SpecialToken": {
+           "id": "[CLS]",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "[SEP]",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "B",
+           "type_id": 1
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "[SEP]",
+           "type_id": 1
+         }
+       }
+     ],
+     "special_tokens": {
+       "[CLS]": {
+         "id": "[CLS]",
+         "ids": [
+           2
+         ],
+         "tokens": [
+           "[CLS]"
+         ]
+       },
+       "[SEP]": {
+         "id": "[SEP]",
+         "ids": [
+           3
+         ],
+         "tokens": [
+           "[SEP]"
+         ]
+       }
+     }
+   },
+   "decoder": {
+     "type": "WordPiece",
+     "prefix": "##",
+     "cleanup": true
+   },
+   "model": {
+     "type": "WordPiece",
+     "unk_token": "[UNK]",
+     "continuing_subword_prefix": "##",
+     "max_input_chars_per_word": 100,
+     "vocab": {
+       "[PAD]": 0,
+       "[UNK]": 1,
+       "[CLS]": 2,
+       "[SEP]": 3,
+       "[MASK]": 4,
+       "N": 5,
+       "A": 6,
+       "C": 7,
+       "G": 8,
+       "T": 9
+     }
+   }
+ }
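
The vocabulary is single-nucleotide (A/C/G/T plus N and five special tokens) and the model is WordPiece, so bases must arrive whitespace-separated for direct use of this file; an unsplit string falls back to [UNK]. A quick check, assuming tokenizer.json has been downloaded to disk:

    from tokenizers import Tokenizer

    tok = Tokenizer.from_file("tokenizer.json")
    print(tok.encode("A C G T").ids)  # [2, 6, 7, 8, 9, 3] -> [CLS] A C G T [SEP]
    print(tok.encode("ACGT").tokens)  # ['[CLS]', '[UNK]', '[SEP]'] -- unsplit word

This is why the Tokenizer wrapper in omnigenome_wrapper.py performs its own character-level splitting instead of relying on WordPiece matching.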
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "do_basic_tokenize": true, "never_split": null, "tokenizer_class": "BertTokenizer"}
tokenizer_wrapper.py ADDED
@@ -0,0 +1,8 @@
+ # -*- coding: utf-8 -*-
+ # file: tokenizer_wrapper.py
+ # time: 00:57 27/04/2024
+ # author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
+ # github: https://github.com/yangheng95
+ # huggingface: https://huggingface.co/yangheng
+ # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
+ # Copyright (C) 2019-2024. All Rights Reserved.