Upload folder using huggingface_hub

__pycache__/__init__.cpython-311.pyc                 ADDED   Binary file (174 Bytes)
__pycache__/configuration_helmbert.cpython-311.pyc   ADDED   Binary file (4.61 kB)
__pycache__/modeling_helmbert.cpython-311.pyc        ADDED   Binary file (51 kB)

configuration_helmbert.py
CHANGED

@@ -66,6 +66,8 @@ class HELMBertConfig(PretrainedConfig):
         # Classification/regression
         num_labels: int = 2,
         problem_type: str = None,
+        classifier_num_layers: int = 0,
+        classifier_dropout: float = 0.1,
         **kwargs,
     ):
         super().__init__(

@@ -102,3 +104,5 @@ class HELMBertConfig(PretrainedConfig):
         # Classification/regression
         self.num_labels = num_labels
         self.problem_type = problem_type
+        self.classifier_num_layers = classifier_num_layers
+        self.classifier_dropout = classifier_dropout

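The two new options control the classification head that modeling_helmbert.py builds below. A minimal sketch of how they would be set, assuming the import path used in the model docstring further down (from helmbert import HELMBertConfig); all other hyperparameters keep their defaults:

    from helmbert import HELMBertConfig

    # Default: a single Linear head on the pooled output, 10% classifier dropout
    config = HELMBertConfig(num_labels=2)

    # Regression target with a 2-layer MLP head and custom dropout
    config = HELMBertConfig(
        num_labels=1,
        classifier_num_layers=2,
        classifier_dropout=0.2,
    )
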
modeling_helmbert.py
CHANGED

@@ -56,7 +56,9 @@ class XSoftmax(torch.autograd.Function):
     """Masked Softmax optimized for memory efficiency."""

     @staticmethod
-    def forward(
+    def forward(
+        ctx, input: torch.Tensor, mask: Optional[torch.Tensor], dim: int
+    ) -> torch.Tensor:
         ctx.dim = dim
         if mask is not None:
             rmask = ~(mask.bool())

@@ -77,7 +79,9 @@ class XSoftmax(torch.autograd.Function):
     def backward(ctx, grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]:
         (output,) = ctx.saved_tensors
         if version.Version(torch.__version__) >= version.Version("1.11.0"):
-            input_grad = _softmax_backward_data(
+            input_grad = _softmax_backward_data(
+                grad_output, output, ctx.dim, output.dtype
+            )
         else:
             input_grad = _softmax_backward_data(grad_output, output, ctx.dim, output)
         return input_grad, None, None

@@ -104,11 +108,14 @@ def build_relative_position(
     max_exact = num_buckets // 4
     is_small = rel_pos < max_exact

-    rel_pos_if_large =
-
-
-
-
+    rel_pos_if_large = (
+        max_exact
+        + (
+            torch.log(rel_pos.float() / max_exact)
+            / math.log(max_position / max_exact)
+            * (num_buckets // 4 - 1)
+        ).long()
+    )
     rel_pos_if_large = torch.min(
         rel_pos_if_large, torch.full_like(rel_pos_if_large, num_buckets // 2 - 1)
     )

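The @@ -104 hunk above fills in DeBERTa-style log-spaced bucketing for large relative distances. A standalone sketch of the same mapping for non-negative distances follows; the function name and the num_buckets / max_position values are illustrative, and the final where-combine is only implied by the is_small flag computed in the surrounding context:

    import math
    import torch

    def bucket_large_positions(rel_pos: torch.Tensor, num_buckets: int, max_position: int) -> torch.Tensor:
        # Distances below max_exact keep their exact value; larger distances are
        # mapped onto logarithmically spaced buckets, as in the hunk above.
        max_exact = num_buckets // 4
        is_small = rel_pos < max_exact
        rel_pos_if_large = (
            max_exact
            + (
                torch.log(rel_pos.float() / max_exact)
                / math.log(max_position / max_exact)
                * (num_buckets // 4 - 1)
            ).long()
        )
        rel_pos_if_large = torch.min(
            rel_pos_if_large, torch.full_like(rel_pos_if_large, num_buckets // 2 - 1)
        )
        return torch.where(is_small, rel_pos, rel_pos_if_large)

    # With num_buckets=32 and max_position=128: distances 0-7 map to themselves,
    # while 8, 16, 64, 127 land in the log-spaced buckets 8, 9, 13, 14.
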
@@ -167,9 +174,13 @@ class DisentangledSelfAttention(nn.Module):
         # Position projections
         if not self.share_att_key:
             if "c2p" in self.pos_att_type or "p2p" in self.pos_att_type:
-                self.pos_key_proj = nn.Linear(
+                self.pos_key_proj = nn.Linear(
+                    config.hidden_size, self.all_head_size, bias=True
+                )
             if "p2c" in self.pos_att_type or "p2p" in self.pos_att_type:
-                self.pos_query_proj = nn.Linear(
+                self.pos_query_proj = nn.Linear(
+                    config.hidden_size, self.all_head_size, bias=False
+                )

         # Dropout
         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

@@ -223,7 +234,9 @@ class DisentangledSelfAttention(nn.Module):
         attention_scores = attention_scores + rel_att

         # Normalize scores for numerical stability
-        attention_scores =
+        attention_scores = (
+            attention_scores - attention_scores.max(dim=-1, keepdim=True)[0].detach()
+        )
         attention_scores = attention_scores.to(hidden_states.dtype)

         # Reshape for XSoftmax

@@ -236,11 +249,15 @@ class DisentangledSelfAttention(nn.Module):
         attention_probs = self.dropout(attention_probs)

         # Apply attention to values
-        attention_probs_flat = attention_probs.view(
+        attention_probs_flat = attention_probs.view(
+            -1, attention_probs.size(-2), attention_probs.size(-1)
+        )
         context_layer = torch.bmm(attention_probs_flat, value_layer)

         # Reshape output
-        context_layer = context_layer.view(
+        context_layer = context_layer.view(
+            -1, self.num_heads, context_layer.size(-2), context_layer.size(-1)
+        )
         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
         new_shape = context_layer.size()[:-2] + (self.all_head_size,)
         context_layer = context_layer.view(*new_shape)

@@ -285,33 +302,49 @@ class DisentangledSelfAttention(nn.Module):
         ].unsqueeze(0)
         rel_embeddings = self.pos_dropout(rel_embeddings)

-        score = torch.zeros_like(query_layer[:, :, :1]).expand(
+        score = torch.zeros_like(query_layer[:, :, :1]).expand(
+            -1, -1, key_layer.size(-2)
+        )

         # Prepare position indices
         c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1)
-        c2p_pos = c2p_pos.squeeze(0).expand(
+        c2p_pos = c2p_pos.squeeze(0).expand(
+            query_layer.size(0), query_layer.size(1), relative_pos.size(-1)
+        )

         # Content-to-position (c2p)
         if "c2p" in self.pos_att_type:
             pos_key_layer = (
-                self.pos_key_proj(rel_embeddings)
+                self.pos_key_proj(rel_embeddings)
+                if not self.share_att_key
+                else self.key_proj(rel_embeddings)
+            )
+            pos_key_layer = self.transpose_for_scores(pos_key_layer).repeat(
+                batch_size, 1, 1
             )
-            pos_key_layer = self.transpose_for_scores(pos_key_layer).repeat(batch_size, 1, 1)

             c2p_scale = 1.0 / math.sqrt(self.head_size * scale_factor)
-            c2p_att = torch.bmm(
+            c2p_att = torch.bmm(
+                query_layer, pos_key_layer.transpose(-1, -2) * c2p_scale
+            )
             c2p_att = torch.gather(c2p_att, dim=-1, index=c2p_pos)
             score = score + c2p_att

         # Position-to-content (p2c)
         if "p2c" in self.pos_att_type:
             pos_query_layer = (
-                self.pos_query_proj(rel_embeddings)
+                self.pos_query_proj(rel_embeddings)
+                if not self.share_att_key
+                else self.query_proj(rel_embeddings)
+            )
+            pos_query_layer = self.transpose_for_scores(pos_query_layer).repeat(
+                batch_size, 1, 1
            )
-            pos_query_layer = self.transpose_for_scores(pos_query_layer).repeat(batch_size, 1, 1)

             p2c_scale = 1.0 / math.sqrt(self.head_size * scale_factor)
-            p2c_att = torch.bmm(
+            p2c_att = torch.bmm(
+                pos_query_layer * p2c_scale, key_layer.transpose(-1, -2)
+            )
             p2c_att = torch.gather(p2c_att, dim=-2, index=c2p_pos)
             score = score + p2c_att

@@ -331,7 +364,9 @@ class HELMBertEmbeddings(nn.Module):
         self.word_embeddings = nn.Embedding(
             config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
         )
-        self.position_embeddings = nn.Embedding(
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, config.hidden_size
+        )
         self.layer_norm = nn.LayerNorm(config.hidden_size)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)

@@ -399,7 +434,11 @@ class NgieLayer(nn.Module):
             Output with n-gram information incorporated
         """
         # Apply 1D convolution
-        out =
+        out = (
+            self.conv(hidden_states.permute(0, 2, 1).contiguous())
+            .permute(0, 2, 1)
+            .contiguous()
+        )

         # Create reverse mask for padding
         if version.Version(torch.__version__) >= version.Version("1.2.0a"):

@@ -414,7 +453,9 @@ class NgieLayer(nn.Module):
         out = self.activation(self.dropout(out))

         # Residual connection with LayerNorm
-        output_states = masked_layer_norm(
+        output_states = masked_layer_norm(
+            self.layer_norm, residual_states + out, attention_mask
+        )

         return output_states

@@ -523,13 +564,17 @@ class HELMBertEncoder(nn.Module):
         self.ngie_layer = NgieLayer(config)

         # Transformer blocks
-        self.layers = nn.ModuleList(
+        self.layers = nn.ModuleList(
+            [TransformerBlock(config) for _ in range(config.num_hidden_layers)]
+        )

     def get_rel_embedding(self) -> Optional[torch.Tensor]:
         """Get relative position embeddings from first layer."""
         if len(self.layers) > 0:
             first_layer = self.layers[0]
-            if hasattr(first_layer, "self_attn") and hasattr(
+            if hasattr(first_layer, "self_attn") and hasattr(
+                first_layer.self_attn, "rel_embeddings"
+            ):
                 return first_layer.self_attn.rel_embeddings.weight
         return None

@@ -589,7 +634,9 @@ class HELMBertEncoder(nn.Module):
             # Apply nGiE after first layer
             if layer_idx == 0:
                 hidden_states_batch = hidden_states.transpose(0, 1)
-                hidden_states_batch = self.ngie_layer(
+                hidden_states_batch = self.ngie_layer(
+                    ngie_input_states, hidden_states_batch, attention_mask
+                )
                 hidden_states = hidden_states_batch.transpose(0, 1)

             # Store layer[-2] for EMD

@@ -647,7 +694,9 @@ class HELMBertPooler(nn.Module):
             Pooled output [batch, hidden]
         """
         if attention_mask is not None:
-            mask_expanded =
+            mask_expanded = (
+                attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
+            )
             sum_embeddings = torch.sum(hidden_states * mask_expanded, 1)
             eps = torch.finfo(hidden_states.dtype).eps
             sum_mask = torch.clamp(mask_expanded.sum(1), min=eps)

@@ -858,7 +907,9 @@ class HELMBertForMaskedLM(HELMBertPreTrainedModel):
             attention_mask = torch.ones_like(input_ids)

         # Embeddings
-        embeddings, position_embeddings = self.helmbert.embeddings(
+        embeddings, position_embeddings = self.helmbert.embeddings(
+            input_ids, attention_mask
+        )

         # Encoder with optional EMD
         encoder_outputs = self.helmbert.encoder(

@@ -886,7 +937,9 @@ class HELMBertForMaskedLM(HELMBertPreTrainedModel):
         loss = None
         if labels is not None:
             loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
-            loss = loss_fct(
+            loss = loss_fct(
+                prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)
+            )

         if not return_dict:
             output = (prediction_scores, hidden_states, attentions)

@@ -900,12 +953,57 @@ class HELMBertForMaskedLM(HELMBertPreTrainedModel):
         )


+class MLPHead(nn.Module):
+    """MLP head with skip connections for classification/regression.
+
+    Architecture: input -> [Linear -> GELU -> LayerNorm -> Dropout (+ skip)] x N -> Linear -> output
+    """
+
+    def __init__(
+        self,
+        input_dim: int,
+        output_dim: int,
+        hidden_dims: list,
+        dropout: float = 0.1,
+    ):
+        super().__init__()
+        self.layers = nn.ModuleList()
+        self.norms = nn.ModuleList()
+        self.dropouts = nn.ModuleList()
+
+        prev_dim = input_dim
+        for hidden_dim in hidden_dims:
+            self.layers.append(nn.Linear(prev_dim, hidden_dim))
+            self.norms.append(nn.LayerNorm(hidden_dim))
+            self.dropouts.append(nn.Dropout(dropout))
+            prev_dim = hidden_dim
+
+        self.output_layer = nn.Linear(prev_dim, output_dim)
+        self.activation = nn.GELU()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        for layer, norm, dropout in zip(self.layers, self.norms, self.dropouts):
+            identity = x
+            x = layer(x)
+            if x.shape == identity.shape:
+                x = x + identity  # Skip connection
+            x = self.activation(x)
+            x = norm(x)
+            x = dropout(x)
+        return self.output_layer(x)
+
+
 class HELMBertForSequenceClassification(HELMBertPreTrainedModel):
     """HELM-BERT for sequence classification/regression.

     Example:
         >>> from helmbert import HELMBertForSequenceClassification, HELMBertConfig
-        >>>
+        >>> # Simple linear head (default)
+        >>> config = HELMBertConfig(num_labels=1)
+        >>> model = HELMBertForSequenceClassification(config)
+        >>>
+        >>> # MLP head with 2 layers (for permeability prediction)
+        >>> config = HELMBertConfig(num_labels=1, classifier_num_layers=2)
         >>> model = HELMBertForSequenceClassification(config)
     """

@@ -915,8 +1013,19 @@ class HELMBertForSequenceClassification(HELMBertPreTrainedModel):
         self.config = config

         self.helmbert = HELMBertModel(config)
-
-
+
+        # Use MLP head if num_layers > 0, otherwise simple linear
+        if config.classifier_num_layers > 0:
+            hidden_dims = [config.hidden_size] * config.classifier_num_layers
+            self.classifier = MLPHead(
+                input_dim=config.hidden_size,
+                output_dim=config.num_labels,
+                hidden_dims=hidden_dims,
+                dropout=config.classifier_dropout,
+            )
+        else:
+            self.dropout = nn.Dropout(config.classifier_dropout)
+            self.classifier = nn.Linear(config.hidden_size, config.num_labels)

         self.post_init()

@@ -951,7 +1060,9 @@ class HELMBertForSequenceClassification(HELMBertPreTrainedModel):
         )

         pooled_output = outputs.pooler_output
-
+        # MLP head has internal dropout, simple linear needs separate dropout
+        if hasattr(self, "dropout"):
+            pooled_output = self.dropout(pooled_output)
         logits = self.classifier(pooled_output)

         loss = None

@@ -959,7 +1070,9 @@ class HELMBertForSequenceClassification(HELMBertPreTrainedModel):
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
-                elif self.num_labels > 1 and (
+                elif self.num_labels > 1 and (
+                    labels.dtype == torch.long or labels.dtype == torch.int
+                ):
                     self.config.problem_type = "single_label_classification"
                 else:
                     self.config.problem_type = "multi_label_classification"

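The MLPHead added above is what HELMBertForSequenceClassification now instantiates when config.classifier_num_layers > 0, with hidden_dims = [hidden_size] * classifier_num_layers. A small shape-check sketch, assuming modeling_helmbert.py is importable and using an illustrative hidden size of 768:

    import torch
    from modeling_helmbert import MLPHead

    # Roughly what HELMBertConfig(num_labels=1, classifier_num_layers=2,
    # classifier_dropout=0.1) wires up on top of the pooled output.
    head = MLPHead(input_dim=768, output_dim=1, hidden_dims=[768, 768], dropout=0.1)

    pooled = torch.randn(4, 768)   # pooled output for a batch of 4 sequences
    print(head(pooled).shape)      # torch.Size([4, 1])
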
tokenization_helmbert.py
CHANGED

@@ -2,7 +2,7 @@

 import json
 import os
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple

 from transformers import PreTrainedTokenizer

@@ -10,43 +10,89 @@
 # Default vocabulary for HELM notation
 HELM_VOCAB = {
     # Special tokens (0-4)
-    " ": 0,
-    "@": 1,
+    " ": 0,  # PAD
+    "@": 1,  # BOS/CLS
     "\n": 2,  # EOS/SEP
-    "§": 3,
-    "¶": 4,
-
+    "§": 3,  # UNK
+    "¶": 4,  # MASK
     # Natural amino acids (5-25)
-    "A": 5,
-    "
-    "
-    "
+    "A": 5,
+    "R": 6,
+    "N": 7,
+    "D": 8,
+    "C": 9,
+    "E": 10,
+    "Q": 11,
+    "G": 12,
+    "H": 13,
+    "I": 14,
+    "L": 15,
+    "K": 16,
+    "M": 17,
+    "F": 18,
+    "P": 19,
+    "S": 20,
+    "T": 21,
+    "W": 22,
+    "Y": 23,
+    "V": 24,
     "X": 25,  # Unknown amino acid
-
     # Structure symbols (26-37)
-    "[": 26,
-    "
-
+    "[": 26,
+    "]": 27,
+    "{": 28,
+    "}": 29,
+    "(": 30,
+    ")": 31,
+    "$": 32,
+    ",": 33,
+    ":": 34,
+    "|": 35,
+    "-": 36,
+    ".": 37,
     # Numbers (38-47)
-    "0": 38,
-    "
-
+    "0": 38,
+    "1": 39,
+    "2": 40,
+    "3": 41,
+    "4": 42,
+    "5": 43,
+    "6": 44,
+    "7": 45,
+    "8": 46,
+    "9": 47,
     # Uppercase non-amino acids (48-50)
-    "B": 48,
-
+    "B": 48,
+    "O": 49,
+    ">": 50,
     # Lowercase letters (51-72)
-    "a": 51,
-    "
-    "
-    "
-    "
-
+    "a": 51,
+    "b": 52,
+    "c": 53,
+    "d": 54,
+    "e": 55,
+    "f": 56,
+    "g": 57,
+    "h": 58,
+    "i": 59,
+    "l": 60,
+    "m": 61,
+    "n": 62,
+    "o": 63,
+    "p": 64,
+    "r": 65,
+    "s": 66,
+    "t": 67,
+    "u": 68,
+    "v": 69,
+    "x": 70,
+    "y": 71,
+    "z": 72,
     # Encoded polymer markers (73-76)
-    "/": 73,
-    "*": 74,
+    "/": 73,  # PEPTIDE
+    "*": 74,  # me
     "\t": 75,  # am
-    "&": 76,
-
+    "&": 76,  # ac
     # Miscellaneous (77)
     "_": 77,
 }

@@ -227,7 +273,12 @@ class HELMBertTokenizer(PreTrainedTokenizer):
             List of 0s and 1s (1 = special token)
         """
         if already_has_special_tokens:
-            return [
+            return [
+                1
+                if x in [self.cls_token_id, self.sep_token_id, self.pad_token_id]
+                else 0
+                for x in token_ids_0
+            ]

         if token_ids_1 is None:
             return [1] + [0] * len(token_ids_0) + [1]