pszmk committed on
Commit
8b6cdbf
·
verified ·
1 Parent(s): 2628c81

Upload protein amino-acid fast tokenizer

Browse files
Files changed (4) hide show
  1. README.md +66 -0
  2. special_tokens_map.json +8 -0
  3. tokenizer.json +190 -0
  4. tokenizer_config.json +66 -0
README.md ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ tags:
5
+ - protein
6
+ - amino-acid
7
+ - tokenizer
8
+ - biology
9
+ license: mit
10
+ library_name: transformers
11
+ ---
12
+
13
+ # Protein Amino-Acid Fast Tokenizer
14
+
15
+ Fast Rust-backed tokenizer for protein sequences.
16
+
17
+ ## Features
18
+
19
+ - **1 token = 1 amino acid** — character-level tokenization
20
+ - **Fast Rust backend** — efficient processing via HuggingFace Tokenizers
21
+ - **Transformer-ready** — compatible with `AutoTokenizer`
22
+
23
+ ## Usage
24
+
25
+ ```python
26
+ from transformers import AutoTokenizer
27
+
28
+ tokenizer = AutoTokenizer.from_pretrained("pszmk/protein-aa-fast-tokenizer")
29
+
30
+ # Single sequence
31
+ tokens = tokenizer("MKTLLILAVAVCSAA")
32
+ print(tokens)
33
+ # {'input_ids': [2, 16, 14, ...], 'attention_mask': [1, 1, ...]}
34
+
35
+ # Batch with padding
36
+ batch = tokenizer(
37
+ ["MKTLLILAVAVCSAA", "ACDEFGHIK"],
38
+ padding=True,
39
+ return_tensors="pt",
40
+ )
41
+ ```
42
+
43
+ ## Vocabulary
44
+
45
+ | ID | Token | Description |
46
+ |----|-------|-------------|
47
+ | 0 | `<PAD>` | Padding |
48
+ | 1 | `<MASK>` | Masked token |
49
+ | 2 | `<CLS>` | Classification / Start |
50
+ | 3 | `<SEP>` | Separator |
51
+ | 4 | `<EOS>` | End of sequence |
52
+ | 5 | `<UNK>` | Unknown |
53
+ | 6-25 | A-Y | Standard amino acids |
54
+ | 26 | X | Any amino acid |
55
+ | 27 | B | Asparagine or Aspartic acid |
56
+ | 28 | Z | Glutamine or Glutamic acid |
57
+
58
+ ## Template Processing
59
+
60
+ - **Single sequence:** `<CLS> SEQUENCE <EOS>`
61
+ - **Pair sequences:** `<CLS> SEQ_A <SEP> SEQ_B <EOS>`
62
+
63
+ ## Citation
64
+
65
+ Part of the LAMP (Latent Anti-Microbial Peptides) project.
66
+
special_tokens_map.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "<CLS>",
3
+ "eos_token": "<EOS>",
4
+ "mask_token": "<MASK>",
5
+ "pad_token": "<PAD>",
6
+ "sep_token": "<SEP>",
7
+ "unk_token": "<UNK>"
8
+ }
tokenizer.json ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<PAD>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<MASK>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<CLS>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<SEP>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "<EOS>",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ },
51
+ {
52
+ "id": 5,
53
+ "content": "<UNK>",
54
+ "single_word": false,
55
+ "lstrip": false,
56
+ "rstrip": false,
57
+ "normalized": false,
58
+ "special": true
59
+ }
60
+ ],
61
+ "normalizer": null,
62
+ "pre_tokenizer": {
63
+ "type": "Split",
64
+ "pattern": {
65
+ "String": ""
66
+ },
67
+ "behavior": "Isolated",
68
+ "invert": false
69
+ },
70
+ "post_processor": {
71
+ "type": "TemplateProcessing",
72
+ "single": [
73
+ {
74
+ "SpecialToken": {
75
+ "id": "<CLS>",
76
+ "type_id": 0
77
+ }
78
+ },
79
+ {
80
+ "Sequence": {
81
+ "id": "A",
82
+ "type_id": 0
83
+ }
84
+ },
85
+ {
86
+ "SpecialToken": {
87
+ "id": "<EOS>",
88
+ "type_id": 0
89
+ }
90
+ }
91
+ ],
92
+ "pair": [
93
+ {
94
+ "SpecialToken": {
95
+ "id": "<CLS>",
96
+ "type_id": 0
97
+ }
98
+ },
99
+ {
100
+ "Sequence": {
101
+ "id": "A",
102
+ "type_id": 0
103
+ }
104
+ },
105
+ {
106
+ "SpecialToken": {
107
+ "id": "<SEP>",
108
+ "type_id": 0
109
+ }
110
+ },
111
+ {
112
+ "Sequence": {
113
+ "id": "B",
114
+ "type_id": 0
115
+ }
116
+ },
117
+ {
118
+ "SpecialToken": {
119
+ "id": "<EOS>",
120
+ "type_id": 0
121
+ }
122
+ }
123
+ ],
124
+ "special_tokens": {
125
+ "<CLS>": {
126
+ "id": "<CLS>",
127
+ "ids": [
128
+ 2
129
+ ],
130
+ "tokens": [
131
+ "<CLS>"
132
+ ]
133
+ },
134
+ "<EOS>": {
135
+ "id": "<EOS>",
136
+ "ids": [
137
+ 4
138
+ ],
139
+ "tokens": [
140
+ "<EOS>"
141
+ ]
142
+ },
143
+ "<SEP>": {
144
+ "id": "<SEP>",
145
+ "ids": [
146
+ 3
147
+ ],
148
+ "tokens": [
149
+ "<SEP>"
150
+ ]
151
+ }
152
+ }
153
+ },
154
+ "decoder": null,
155
+ "model": {
156
+ "type": "WordLevel",
157
+ "vocab": {
158
+ "<PAD>": 0,
159
+ "<MASK>": 1,
160
+ "<CLS>": 2,
161
+ "<SEP>": 3,
162
+ "<EOS>": 4,
163
+ "<UNK>": 5,
164
+ "A": 6,
165
+ "C": 7,
166
+ "D": 8,
167
+ "E": 9,
168
+ "F": 10,
169
+ "G": 11,
170
+ "H": 12,
171
+ "I": 13,
172
+ "K": 14,
173
+ "L": 15,
174
+ "M": 16,
175
+ "N": 17,
176
+ "P": 18,
177
+ "Q": 19,
178
+ "R": 20,
179
+ "S": 21,
180
+ "T": 22,
181
+ "V": 23,
182
+ "W": 24,
183
+ "Y": 25,
184
+ "X": 26,
185
+ "B": 27,
186
+ "Z": 28
187
+ },
188
+ "unk_token": "<UNK>"
189
+ }
190
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<PAD>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<MASK>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<CLS>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<SEP>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<EOS>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "<UNK>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ }
51
+ },
52
+ "clean_up_tokenization_spaces": false,
53
+ "cls_token": "<CLS>",
54
+ "eos_token": "<EOS>",
55
+ "extra_special_tokens": {},
56
+ "mask_token": "<MASK>",
57
+ "model_input_names": [
58
+ "input_ids",
59
+ "attention_mask"
60
+ ],
61
+ "model_max_length": 1000000000000000019884624838656,
62
+ "pad_token": "<PAD>",
63
+ "sep_token": "<SEP>",
64
+ "tokenizer_class": "PreTrainedTokenizerFast",
65
+ "unk_token": "<UNK>"
66
+ }