Upload tokenization_plant_protein_bert.py with huggingface_hub
tokenization_plant_protein_bert.py
ADDED
@@ -0,0 +1,132 @@
"""HuggingFace-compatible tokenizer for Plant Protein BERT.

Self-contained tokenizer file for loading from HuggingFace Hub
with ``trust_remote_code=True``. No external project dependencies.
"""

from __future__ import annotations

import json
import os
from typing import Dict, List, Optional, Tuple

from transformers import PreTrainedTokenizer


# ── Vocabulary ────────────────────────────────────────────────────────

SPECIAL_TOKENS = ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
AMINO_ACIDS = list("ACDEFGHIKLMNPQRSTVWY")
VOCAB = SPECIAL_TOKENS + AMINO_ACIDS
VOCAB_SIZE = len(VOCAB)  # 25

VOCAB_FILE_NAME = "vocab.json"


class PlantProteinBertTokenizer(PreTrainedTokenizer):
    """Character-level amino acid tokenizer for protein sequences.

    Maps each of the 20 standard amino acids and 5 special tokens
    to integer IDs.

    Vocabulary (25 tokens)::

        [PAD]=0 [CLS]=1 [SEP]=2 [MASK]=3 [UNK]=4
        A=5  C=6  D=7  E=8  F=9  G=10 H=11 I=12 K=13 L=14
        M=15 N=16 P=17 Q=18 R=19 S=20 T=21 V=22 W=23 Y=24
    """

    vocab_files_names = {"vocab_file": VOCAB_FILE_NAME}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        model_max_length=1024,
        **kwargs,
    ):
        if vocab_file is not None and os.path.isfile(vocab_file):
            with open(vocab_file, "r", encoding="utf-8") as f:
                self._vocab: Dict[str, int] = json.load(f)
        else:
            self._vocab = {tok: idx for idx, tok in enumerate(VOCAB)}

        self._id_to_token: Dict[int, str] = {v: k for k, v in self._vocab.items()}

        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            model_max_length=model_max_length,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        return len(self._vocab)

    def get_vocab(self) -> Dict[str, int]:
        return dict(self._vocab)

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        return list(text.upper())

    def _convert_token_to_id(self, token: str) -> int:
        return self._vocab.get(token, self._vocab.get("[UNK]", 4))

    def _convert_id_to_token(self, index: int) -> str:
        return self._id_to_token.get(index, "[UNK]")

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return "".join(tokens)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None,
    ) -> List[int]:
        cls_id = [self.cls_token_id]
        sep_id = [self.sep_token_id]
        if token_ids_1 is None:
            return cls_id + token_ids_0 + sep_id
        return cls_id + token_ids_0 + sep_id + token_ids_1 + sep_id

    def get_special_tokens_mask(
        self, token_ids_0, token_ids_1=None, already_has_special_tokens=False,
    ) -> List[int]:
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if token_ids_1 is None:
            return [1] + [0] * len(token_ids_0) + [1]
        return [1] + [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0, token_ids_1=None,
    ) -> List[int]:
        cls_id = [self.cls_token_id]
        sep_id = [self.sep_token_id]
        if token_ids_1 is None:
            return [0] * len(cls_id + token_ids_0 + sep_id)
        return [0] * len(cls_id + token_ids_0 + sep_id) + [1] * len(token_ids_1 + sep_id)

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory, exist_ok=True)
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILE_NAME,
        )
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, indent=2, ensure_ascii=False)
        return (vocab_file,)
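Usage sketch: once the repository's tokenizer_config.json maps AutoTokenizer to this class via auto_map, the tokenizer can be loaded directly from the Hub with trust_remote_code=True, as the module docstring describes. The snippet below is a minimal illustration, not part of the uploaded file; the repo id "your-org/plant-protein-bert" and the example sequence are placeholders.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "your-org/plant-protein-bert",  # placeholder repo id, substitute the real one
    trust_remote_code=True,         # required: the tokenizer class lives in this file, not in transformers
)

# Each residue becomes one token; [CLS] and [SEP] wrap the sequence.
enc = tokenizer("MKTAYIAK")
print(enc["input_ids"])
# With the default vocabulary this should be [1, 15, 13, 21, 5, 24, 12, 5, 13, 2]
print(tokenizer.decode(enc["input_ids"], skip_special_tokens=True))
# "MKTAYIAK"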