Add missing base Bertose source file

Browse files

Files changed (3) hide show

README.md +1 -0
SHA256SUMS +26 -1
src/glycan_bert.py +303 -0

README.md CHANGED Viewed

@@ -22,6 +22,7 @@ This repository contains the contrastive Bertose checkpoint used to score ambigu
 - `vocab/bpe_vocabulary.json` - WURCS BPE vocabulary.
 - `vocab/bpe_ambiguity_tokens.json` - ambiguous BPE token map used by the resolver.
 - `src/multimodal_glycan_bert_v3.py` - model definition.
 - `src/wurcs_bpe_tokenizer.py` - WURCS BPE tokenizer.
 ## Expected Input

 - `vocab/bpe_vocabulary.json` - WURCS BPE vocabulary.
 - `vocab/bpe_ambiguity_tokens.json` - ambiguous BPE token map used by the resolver.
 - `src/multimodal_glycan_bert_v3.py` - model definition.
+- `src/glycan_bert.py` - base BERT layers used by the multimodal model.
 - `src/wurcs_bpe_tokenizer.py` - WURCS BPE tokenizer.
 ## Expected Input

SHA256SUMS CHANGED Viewed

@@ -1,8 +1,33 @@
 622368f62c23e97e9137c277eaadcc93ee3901cbb420b591422bb1c2e19689a5  ./.gitattributes
-266caeb2fb9b68076343b40da91116dca0f2302f03cf28c2332b80b1a69c1758  ./README.md
 ae468f4e8c06dc0c3848138a474dc43249aa6d14dfd0df8f58d68fcaad371152  ./checkpoints/best_v51_contrastive_model.pt
 daf55c190fece0678064e41697a9545592beb1285f8aa74e595b933b9d37b4c2  ./config.json
 6a56e6f73b8f874470ecde6e538f3f5029ae23aa6c10559817d1c2a8b59b7c0f  ./requirements.txt
 0d9ce16bf90242f38621d64cd974ea5679bff4c2013bea8d7bffe1b8dd120794  ./src/multimodal_glycan_bert_v3.py
 0bc54399362945601bcfd403441fc80968d173200dd0561f57568b2053a94839  ./src/wurcs_bpe_tokenizer.py
 c68cd003370b2dcdb162f848f766e4e62f2653c6c38d205f8cbe53a9aabe2d74  ./vocab/bpe_ambiguity_tokens.json

+684888c0ebb17f374298b65ee2807526c066094c701bcc7ebbe1c1095f494fc1  ./.cache/huggingface/.gitignore
+e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  ./.cache/huggingface/upload/.gitattributes.lock
+3098e38608a2c2375ac1f78d4c4f52680796f4ff9c0dbaad6b4f0b110fbc7fc3  ./.cache/huggingface/upload/.gitattributes.metadata
+e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  ./.cache/huggingface/upload/README.md.lock
+ecc75cccadd48cf2cc8d22daec846b6a760f492162ca145c4cfef3536dafcc2a  ./.cache/huggingface/upload/README.md.metadata
+e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  ./.cache/huggingface/upload/SHA256SUMS.lock
+aa2c2e921401dba265bdd190a662861cffd8ff05eaf6ae45a96a25385bd6c5e4  ./.cache/huggingface/upload/SHA256SUMS.metadata
+e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  ./.cache/huggingface/upload/checkpoints/best_v51_contrastive_model.pt.lock
+0bc5904fe02b6a64df35829729c29d40f0c0a795d586b10d844fbee91e6fa0e7  ./.cache/huggingface/upload/checkpoints/best_v51_contrastive_model.pt.metadata
+e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  ./.cache/huggingface/upload/config.json.lock
+9370200adedd2172ffd8459528e7fd47c5913bf9e791f5b731b0e16121ca3ebf  ./.cache/huggingface/upload/config.json.metadata
+e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  ./.cache/huggingface/upload/requirements.txt.lock
+fef169fb7e8af9c14c21240bb9034cd567bd18dc327ab39423d68ba3b2ee413a  ./.cache/huggingface/upload/requirements.txt.metadata
+e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  ./.cache/huggingface/upload/src/multimodal_glycan_bert_v3.py.lock
+65dcbe6e66d8bba618e4d22209bd2e83b73b5de767b892c1bbd43db1c9326f42  ./.cache/huggingface/upload/src/multimodal_glycan_bert_v3.py.metadata
+e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  ./.cache/huggingface/upload/src/wurcs_bpe_tokenizer.py.lock
+28ca0e31a94c80afc124627b62a574125270a5f269bdff012fd36b465578dc82  ./.cache/huggingface/upload/src/wurcs_bpe_tokenizer.py.metadata
+e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  ./.cache/huggingface/upload/vocab/bpe_ambiguity_tokens.json.lock
+eb200fe67e613751c0571950e9a7f22f9f44fde0f85b73a40d392189a203f465  ./.cache/huggingface/upload/vocab/bpe_ambiguity_tokens.json.metadata
+e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  ./.cache/huggingface/upload/vocab/bpe_vocabulary.json.lock
+c00560217b399adfb341aacc38053299c7d4b33b4229e89e68275cd454bb7f5b  ./.cache/huggingface/upload/vocab/bpe_vocabulary.json.metadata
 622368f62c23e97e9137c277eaadcc93ee3901cbb420b591422bb1c2e19689a5  ./.gitattributes
+21912ebe4c2b720eac3164c3628f37a39d6c918221c84e04b76a914fd709752d  ./README.md
 ae468f4e8c06dc0c3848138a474dc43249aa6d14dfd0df8f58d68fcaad371152  ./checkpoints/best_v51_contrastive_model.pt
 daf55c190fece0678064e41697a9545592beb1285f8aa74e595b933b9d37b4c2  ./config.json
 6a56e6f73b8f874470ecde6e538f3f5029ae23aa6c10559817d1c2a8b59b7c0f  ./requirements.txt
+789fde2ce01f83a5bb363aee29fe33809e2a7015c47c1915655c208d8beec496  ./src/__pycache__/glycan_bert.cpython-312.pyc
+9a0d7855e244b3a1ff369eba4da5303d528f067d1092fefd5a93c9db164de000  ./src/__pycache__/multimodal_glycan_bert_v3.cpython-312.pyc
+62259d1fe3d8736e57cadf8ce5a8bf24a7b73368d4d653c2e0d56ac94b94fe76  ./src/__pycache__/wurcs_bpe_tokenizer.cpython-312.pyc
+b69f14c9976951325e3a0a4e8107a16126e67d410e966650f513f1f538a732bb  ./src/glycan_bert.py
 0d9ce16bf90242f38621d64cd974ea5679bff4c2013bea8d7bffe1b8dd120794  ./src/multimodal_glycan_bert_v3.py
 0bc54399362945601bcfd403441fc80968d173200dd0561f57568b2053a94839  ./src/wurcs_bpe_tokenizer.py
 c68cd003370b2dcdb162f848f766e4e62f2653c6c38d205f8cbe53a9aabe2d74  ./vocab/bpe_ambiguity_tokens.json

src/glycan_bert.py ADDED Viewed

	@@ -0,0 +1,303 @@

+"""
+Glycan BERT Model
+Transformer-based masked language model for glycan structures.
+Based on BERT/ESM2 architecture adapted for atomic glycan tokenization.
+"""
+import torch
+import torch.nn as nn
+import math
+class GlycanBERTConfig:
+    """Configuration for GlycanBERT."""
+    def __init__(
+        self,
+        vocab_size: int = 102,
+        hidden_size: int = 384,
+        num_hidden_layers: int = 6,
+        num_attention_heads: int = 6,
+        intermediate_size: int = 1536,
+        hidden_dropout_prob: float = 0.1,
+        attention_probs_dropout_prob: float = 0.1,
+        max_position_embeddings: int = 512,
+        layer_norm_eps: float = 1e-12,
+        pad_token_id: int = 0,
+        mask_token_id: int = 4,
+        initializer_range: float = 0.02
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.layer_norm_eps = layer_norm_eps
+        self.pad_token_id = pad_token_id
+        self.mask_token_id = mask_token_id
+        self.initializer_range = initializer_range
+class GlycanBERTEmbeddings(nn.Module):
+    """
+    Embeddings for glycan tokens including token and positional embeddings.
+    """
+    def __init__(self, config: GlycanBERTConfig):
+        super().__init__()
+        self.token_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, max_seq_len) is contiguous in memory and exported when serialized
+        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            input_ids: Tensor of shape (batch_size, seq_len)
+        Returns:
+            Embeddings of shape (batch_size, seq_len, hidden_size)
+        """
+        batch_size, seq_len = input_ids.shape
+        # Token embeddings
+        token_embeds = self.token_embeddings(input_ids)
+        # Position embeddings
+        position_ids = self.position_ids[:, :seq_len]
+        position_embeds = self.position_embeddings(position_ids)
+        # Combine
+        embeddings = token_embeds + position_embeds
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+class GlycanBERTAttention(nn.Module):
+    """Multi-head self-attention."""
+    def __init__(self, config: GlycanBERTConfig):
+        super().__init__()
+        assert config.hidden_size % config.num_attention_heads == 0
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = config.hidden_size // config.num_attention_heads
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+        """Reshape for multi-head attention."""
+        new_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(*new_shape)
+        return x.permute(0, 2, 1, 3)  # (batch, heads, seq_len, head_size)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor = None
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states: (batch_size, seq_len, hidden_size)
+            attention_mask: (batch_size, seq_len) - 1 for valid, 0 for padding
+        Returns:
+            Attention output: (batch_size, seq_len, hidden_size)
+        """
+        batch_size, seq_len, _ = hidden_states.shape
+        # Linear projections
+        query_layer = self.transpose_for_scores(self.query(hidden_states))
+        key_layer = self.transpose_for_scores(self.key(hidden_states))
+        value_layer = self.transpose_for_scores(self.value(hidden_states))
+        # Attention scores
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        # Apply attention mask
+        if attention_mask is not None:
+            # Convert mask to additive mask
+            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)  # (batch, 1, 1, seq_len)
+            attention_mask = (1.0 - attention_mask) * -10000.0
+            attention_scores = attention_scores + attention_mask
+        # Attention probabilities
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+        attention_probs = self.dropout(attention_probs)
+        # Apply attention to values
+        context_layer = torch.matmul(attention_probs, value_layer)
+        # Reshape back
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_shape)
+        return context_layer
+class GlycanBERTLayer(nn.Module):
+    """Single transformer layer."""
+    def __init__(self, config: GlycanBERTConfig):
+        super().__init__()
+        self.attention = GlycanBERTAttention(config)
+        self.attention_output = nn.Linear(config.hidden_size, config.hidden_size)
+        self.attention_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.intermediate = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.output = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.output_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor = None
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states: (batch_size, seq_len, hidden_size)
+            attention_mask: (batch_size, seq_len)
+        Returns:
+            Output: (batch_size, seq_len, hidden_size)
+        """
+        # Self-attention
+        attention_output = self.attention(hidden_states, attention_mask)
+        attention_output = self.attention_output(attention_output)
+        attention_output = self.dropout(attention_output)
+        # Add & Norm
+        hidden_states = self.attention_layer_norm(hidden_states + attention_output)
+        # Feed-forward
+        intermediate_output = self.intermediate(hidden_states)
+        intermediate_output = nn.functional.gelu(intermediate_output)
+        layer_output = self.output(intermediate_output)
+        layer_output = self.dropout(layer_output)
+        # Add & Norm
+        layer_output = self.output_layer_norm(hidden_states + layer_output)
+        return layer_output
+class GlycanBERT(nn.Module):
+    """
+    Glycan BERT model for masked language modeling.
+    """
+    def __init__(self, config: GlycanBERTConfig):
+        super().__init__()
+        self.config = config
+        # Embeddings
+        self.embeddings = GlycanBERTEmbeddings(config)
+        # Transformer layers
+        self.layers = nn.ModuleList([GlycanBERTLayer(config) for _ in range(config.num_hidden_layers)])
+        # MLM head
+        self.mlm_head = nn.Linear(config.hidden_size, config.vocab_size)
+        # Initialize weights
+        self.apply(self._init_weights)
+    def _init_weights(self, module):
+        """Initialize weights."""
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: torch.Tensor = None,
+        labels: torch.Tensor = None
+    ):
+        """
+        Args:
+            input_ids: (batch_size, seq_len)
+            attention_mask: (batch_size, seq_len) - 1 for valid, 0 for padding
+            labels: (batch_size, seq_len) - token IDs to predict, -100 for positions to ignore
+        Returns:
+            If labels provided: (loss, logits)
+            Else: logits
+        """
+        # Create attention mask if not provided
+        if attention_mask is None:
+            attention_mask = (input_ids != self.config.pad_token_id).float()
+        # Embeddings
+        hidden_states = self.embeddings(input_ids)
+        # Transformer layers
+        for layer in self.layers:
+            hidden_states = layer(hidden_states, attention_mask)
+        # MLM prediction
+        logits = self.mlm_head(hidden_states)
+        # Calculate loss if labels provided
+        loss = None
+        if labels is not None:
+            loss_fct = nn.CrossEntropyLoss()  # -100 is ignored
+            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
+        if loss is not None:
+            return loss, logits
+        return logits
+    def get_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: torch.Tensor = None
+    ) -> torch.Tensor:
+        """
+        Get contextualized embeddings (for downstream tasks).
+        Args:
+            input_ids: (batch_size, seq_len)
+            attention_mask: (batch_size, seq_len)
+        Returns:
+            Embeddings: (batch_size, seq_len, hidden_size)
+        """
+        if attention_mask is None:
+            attention_mask = (input_ids != self.config.pad_token_id).float()
+        hidden_states = self.embeddings(input_ids)
+        for layer in self.layers:
+            hidden_states = layer(hidden_states, attention_mask)
+        return hidden_states