dipayan26 committed on
Commit
e3d31c9
·
verified ·
1 Parent(s): 1750512

Upload tokenization_plant_protein_bert.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenization_plant_protein_bert.py +132 -0
tokenization_plant_protein_bert.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """HuggingFace-compatible tokenizer for Plant Protein BERT.
2
+
3
+ Self-contained tokenizer file for loading from HuggingFace Hub
4
+ with ``trust_remote_code=True``. No external project dependencies.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import os
11
+ from typing import Dict, List, Optional, Tuple
12
+
13
+ from transformers import PreTrainedTokenizer
14
+
15
+
16
+ # ── Vocabulary ───────────────────────────────────────────────────────
17
+
18
# Special tokens occupy indices 0-4; the 20 standard amino acids
# (one-letter codes, alphabetical) follow at indices 5-24.
SPECIAL_TOKENS = ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
AMINO_ACIDS = [*"ACDEFGHIKLMNPQRSTVWY"]
VOCAB = [*SPECIAL_TOKENS, *AMINO_ACIDS]
VOCAB_SIZE = len(VOCAB)  # 5 special tokens + 20 amino acids = 25

# File name used when the vocabulary is written to / read from disk.
VOCAB_FILE_NAME = "vocab.json"
24
+
25
+
26
class PlantProteinBertTokenizer(PreTrainedTokenizer):
    """Character-level amino acid tokenizer for protein sequences.

    Maps each of the 20 standard amino acids and 5 special tokens
    to integer IDs.

    Vocabulary (25 tokens)::

        [PAD]=0  [CLS]=1  [SEP]=2  [MASK]=3  [UNK]=4
        A=5  C=6  D=7  E=8  F=9  G=10  H=11  I=12  K=13  L=14
        M=15  N=16  P=17  Q=18  R=19  S=20  T=21  V=22  W=23  Y=24

    Input text is upper-cased and split into single characters; any
    character that is not in the vocabulary is mapped to the ``[UNK]`` ID.
    """

    vocab_files_names = {"vocab_file": VOCAB_FILE_NAME}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        model_max_length=1024,
        **kwargs,
    ):
        """Create the tokenizer.

        Args:
            vocab_file: Optional path to a JSON file holding a
                ``{token: id}`` mapping. When it is ``None`` or does not
                point to an existing file, the built-in 25-token
                vocabulary is used instead.
            unk_token: Token used for unknown characters.
            sep_token: Separator token appended after each sequence.
            pad_token: Padding token.
            cls_token: Token prepended to every encoded input.
            mask_token: Mask token.
            model_max_length: Maximum sequence length forwarded to the
                base class.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer``.
        """
        # The vocabulary is built before the base initializer runs, so
        # that it is already available if the base class queries it while
        # registering the special tokens.
        if vocab_file is not None and os.path.isfile(vocab_file):
            with open(vocab_file, "r", encoding="utf-8") as f:
                self._vocab: Dict[str, int] = json.load(f)
        else:
            self._vocab = {tok: idx for idx, tok in enumerate(VOCAB)}

        # Reverse lookup used when decoding IDs back to tokens.
        self._id_to_token: Dict[int, str] = {v: k for k, v in self._vocab.items()}

        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            model_max_length=model_max_length,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Number of entries in the base vocabulary (added tokens excluded)."""
        return len(self._vocab)

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the token -> ID mapping, including added tokens.

        Tokens registered later through ``add_tokens`` /
        ``add_special_tokens`` must be part of this mapping: the slow
        tokenizer base class derives ``len(tokenizer)`` and the next free
        ID for newly added tokens from it. Returning only the base
        vocabulary lets two separate ``add_tokens`` calls hand out the
        same ID.
        """
        vocab = dict(self._vocab)
        # ``getattr`` with a default keeps this safe if it is called before
        # the base class has created its added-token table.
        vocab.update(getattr(self, "added_tokens_encoder", None) or {})
        return vocab

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Upper-case ``text`` and split it into single-character tokens."""
        return list(text.upper())

    def _convert_token_to_id(self, token: str) -> int:
        """Return the ID of ``token``, falling back to the ``[UNK]`` ID."""
        # 4 is the ``[UNK]`` index of the built-in vocabulary; it is only
        # reached when a custom vocab file has no ``[UNK]`` entry.
        return self._vocab.get(token, self._vocab.get("[UNK]", 4))

    def _convert_id_to_token(self, index: int) -> str:
        """Return the token for ``index``, or ``"[UNK]"`` if it is unknown."""
        return self._id_to_token.get(index, "[UNK]")

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Concatenate tokens without separators."""
        return "".join(tokens)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None,
    ) -> List[int]:
        """Wrap one or two ID sequences with special tokens.

        Returns ``[CLS] A [SEP]`` for a single sequence and
        ``[CLS] A [SEP] B [SEP]`` for a pair.
        """
        cls_id = [self.cls_token_id]
        sep_id = [self.sep_token_id]
        if token_ids_1 is None:
            return cls_id + token_ids_0 + sep_id
        return cls_id + token_ids_0 + sep_id + token_ids_1 + sep_id

    def get_special_tokens_mask(
        self, token_ids_0, token_ids_1=None, already_has_special_tokens=False,
    ) -> List[int]:
        """Return a mask with 1 at special-token positions and 0 elsewhere.

        When ``already_has_special_tokens`` is true the work is delegated
        to the base class; otherwise the mask mirrors the layout produced
        by ``build_inputs_with_special_tokens``.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if token_ids_1 is None:
            return [1] + [0] * len(token_ids_0) + [1]
        return [1] + [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0, token_ids_1=None,
    ) -> List[int]:
        """Return segment IDs: 0 for ``[CLS] A [SEP]``, 1 for ``B [SEP]``."""
        cls_id = [self.cls_token_id]
        sep_id = [self.sep_token_id]
        if token_ids_1 is None:
            return [0] * len(cls_id + token_ids_0 + sep_id)
        return [0] * len(cls_id + token_ids_0 + sep_id) + [1] * len(token_ids_1 + sep_id)

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        """Write the base vocabulary as JSON and return the file path.

        Args:
            save_directory: Target directory; created if it is missing.
            filename_prefix: Optional prefix, joined to the file name
                with a ``-``.

        Returns:
            A one-element tuple holding the path of the written file.
        """
        # ``exist_ok=True`` already covers the "directory exists" case.
        os.makedirs(save_directory, exist_ok=True)
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILE_NAME,
        )
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, indent=2, ensure_ascii=False)
        return (vocab_file,)