Polish Affinose release source

Browse files

Files changed (5)

SHA256SUMS +4 -4
src/affinose_dataset.py +6 -7
src/affinose_model.py +3 -3
src/bertose_layers.py +4 -6
src/bertose_model.py +19 -20

SHA256SUMS CHANGED Viewed

@@ -3,10 +3,10 @@ f474c23adc30c94a8f6867c8260213eddb07c9732ab285078f2f08a9ad9fd062  ./README.md
 533fe4f9317782e39ad7980caa0f47ad92f1be999dd9489425a9429e2e7c15cb  ./checkpoints/affinose_interaction_model.pt
 043fcb1c7eb97e22fefe8fadadeff97b56ede254b95e318553175837a1e57114  ./config.json
 1be88f15fd905882c711f7ceb59b619a93f9cee2c6c7c031f8dbabd35b29e9e0  ./requirements.txt
-3efc954aff46a0017146e60a803f26b7fa15ded4bcba66653ae443456ae5ab2d  ./src/affinose_dataset.py
 6f83790933a2e4f10160abdca9a6db1a4407ded780abf1d71407963a556dedb6  ./src/affinose_inference.py
-eda11ff25cf5bf34ebcb1dd0a81c9184314c66ef9664f46e8824f6a1e7769217  ./src/affinose_model.py
-b69f14c9976951325e3a0a4e8107a16126e67d410e966650f513f1f538a732bb  ./src/bertose_layers.py
-f247a6c09132a61cb649acfe022b269b5b94c37a5069fcb62045f3340b96b191  ./src/bertose_model.py
 0bc54399362945601bcfd403441fc80968d173200dd0561f57568b2053a94839  ./src/wurcs_bpe_tokenizer.py
 6a572afdf53f1494ab96c896876b824ca7ea749777352606aa9f96bf270ceecc  ./vocab/bpe_vocabulary.json

 533fe4f9317782e39ad7980caa0f47ad92f1be999dd9489425a9429e2e7c15cb  ./checkpoints/affinose_interaction_model.pt
 043fcb1c7eb97e22fefe8fadadeff97b56ede254b95e318553175837a1e57114  ./config.json
 1be88f15fd905882c711f7ceb59b619a93f9cee2c6c7c031f8dbabd35b29e9e0  ./requirements.txt
+175811ef7be90383787858fafbb929d7a87e039bc0185d4d0f6d216dc92a48ed  ./src/affinose_dataset.py
 6f83790933a2e4f10160abdca9a6db1a4407ded780abf1d71407963a556dedb6  ./src/affinose_inference.py
+de14de370a77237ef2ac5c88714c83a54ce3f696efb710f93f8be19106c7fa95  ./src/affinose_model.py
+6362da8e8de0dc4d580c7d94ef6ab1dbc737da13127fc4078681ce6315180086  ./src/bertose_layers.py
+3c5b826fcf5850749f74d980eee48d0595557f3d6e2a58aa873902817eb65c64  ./src/bertose_model.py
 0bc54399362945601bcfd403441fc80968d173200dd0561f57568b2053a94839  ./src/wurcs_bpe_tokenizer.py
 6a572afdf53f1494ab96c896876b824ca7ea749777352606aa9f96bf270ceecc  ./vocab/bpe_vocabulary.json

src/affinose_dataset.py CHANGED Viewed

@@ -48,16 +48,15 @@ def load_bpe_tokenizer(vocab_path: str):
     utils_dir = None
     for root in candidate_roots:
-        candidate = root / "bert_training_v4" / "downstream_tasks" / "utils"
-        if candidate.exists():
-            utils_dir = candidate
             break
     if utils_dir is None:
-        utils_dir = Path(
-            "/work/ratul1/supantha/glycan-SD-VS/bert_training_v3/"
-            "v3.1_cluster_training/bert_training_v4/downstream_tasks/utils"
-        )
     utils_dir = str(utils_dir)
     if utils_dir not in sys.path:

     utils_dir = None
     for root in candidate_roots:
+        for candidate in [root, root / "src", root / "downstream_tasks" / "utils"]:
+            if (candidate / "wurcs_bpe_tokenizer.py").exists():
+                utils_dir = candidate
+                break
+        if utils_dir is not None:
             break
     if utils_dir is None:
+        utils_dir = Path(__file__).resolve().parent
     utils_dir = str(utils_dir)
     if utils_dir not in sys.path:

src/affinose_model.py CHANGED Viewed

@@ -40,10 +40,10 @@ def _default_bertose_root() -> Path:
     here = Path(__file__).resolve()
     for parent in here.parents:
-        if (parent / "bert_training_v4").exists() and (parent / "model").exists():
             return parent
-    return Path("/work/ratul1/supantha/glycan-SD-VS/bert_training_v3/v3.1_cluster_training")
 BERTOSE_ROOT = _default_bertose_root()
@@ -55,7 +55,7 @@ def _ensure_bertose_imports():
     roots = [
         str(source_dir),
         str(BERTOSE_ROOT),
-        str(BERTOSE_ROOT / "bert_training_v4"),
     ]
     for root in roots:
         if root not in sys.path:

     here = Path(__file__).resolve()
     for parent in here.parents:
+        if (parent / "src").exists() or (parent / "bertose_model.py").exists():
             return parent
+    return here.parent
 BERTOSE_ROOT = _default_bertose_root()
     roots = [
         str(source_dir),
         str(BERTOSE_ROOT),
+        str(BERTOSE_ROOT / "src"),
     ]
     for root in roots:
         if root not in sys.path:

src/bertose_layers.py CHANGED Viewed

@@ -1,8 +1,7 @@
 """
-Glycan BERT Model
-Transformer-based masked language model for glycan structures.
-Based on BERT/ESM2 architecture adapted for atomic glycan tokenization.
 """
 import torch
@@ -11,7 +10,7 @@ import math
 class GlycanBERTConfig:
-    """Configuration for GlycanBERT."""
     def __init__(
         self,
@@ -203,7 +202,7 @@ class GlycanBERTLayer(nn.Module):
 class GlycanBERT(nn.Module):
     """
-    Glycan BERT model for masked language modeling.
     """
     def __init__(self, config: GlycanBERTConfig):
@@ -300,4 +299,3 @@ class GlycanBERT(nn.Module):
             hidden_states = layer(hidden_states, attention_mask)
         return hidden_states

 """
+Bertose transformer layers.
+Transformer blocks adapted for WURCS glycan tokenization.
 """
 import torch
 class GlycanBERTConfig:
+    """Configuration for the Bertose transformer stack."""
     def __init__(
         self,
 class GlycanBERT(nn.Module):
     """
+    Bertose transformer stack for masked language modeling.
     """
     def __init__(self, config: GlycanBERTConfig):
             hidden_states = layer(hidden_states, attention_mask)
         return hidden_states

src/bertose_model.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """
-Multimodal Glycan BERT Model v3
-Extends GlycanBERT to handle three modalities:
 - Sequence (WURCS atomic tokenization)
 - MS (mass spectrometry peaks, RT, intensity)
 - 3D structure (VQ-VAE discrete tokens, 4 per residue)
@@ -40,11 +40,11 @@ class ConvGlycanBERTEmbeddings(nn.Module):
             config.max_position_embeddings, config.hidden_size
         )
-        # NEW: Branch depth embeddings - encodes depth in glycan tree (0=root, 1=child, etc.)
         max_branch_depth = getattr(config, "max_branch_depth", 8)
         self.branch_embeddings = nn.Embedding(max_branch_depth, config.hidden_size)
-        # NEW: Linkage type embeddings - encodes chemistry of glycosidic bond
         # 0=none, 1=1-3, 2=1-4, 3=1-6, etc.
         num_linkage_types = getattr(config, "num_linkage_types", 9)
         self.linkage_embeddings = nn.Embedding(num_linkage_types, config.hidden_size)
@@ -83,13 +83,13 @@ class ConvGlycanBERTEmbeddings(nn.Module):
         position_ids = self.position_ids[:, :seq_len]
         x = x + self.position_embeddings(position_ids)
-        # NEW: Add branch depth embeddings (encodes tree structure)
         if branch_depths is not None:
             # Clamp to valid range
             branch_depths = branch_depths.clamp(0, self.branch_embeddings.num_embeddings - 1)
             x = x + self.branch_embeddings(branch_depths)
-        # NEW: Add linkage type embeddings (encodes bond chemistry)
         if linkage_types is not None:
             linkage_types = linkage_types.clamp(0, self.linkage_embeddings.num_embeddings - 1)
             x = x + self.linkage_embeddings(linkage_types)
@@ -148,7 +148,7 @@ def create_residue_level_mask(
 class MultimodalGlycanBERTConfig:
-    """Configuration for Multimodal GlycanBERT v3."""
     def __init__(
         self,
@@ -247,7 +247,7 @@ class MultimodalGlycanBERTConfig:
         self.seq_loss_weight = seq_loss_weight
         self.ms_loss_weight = ms_loss_weight
         self.struct_loss_weight = struct_loss_weight
-        self.dist_loss_weight = 0.25  # NEW: Topology loss weight (default, can override from config)
         # Token IDs
         self.pad_token_id = pad_token_id
@@ -610,7 +610,7 @@ class CrossAttentionLayer(nn.Module):
 class MultimodalGlycanBERT(nn.Module):
     """
-    Multimodal BERT for glycan representation learning (v3).
     Architecture:
     1. Separate encoders for each modality (sequence, MS, 3D structure)
@@ -628,7 +628,7 @@ class MultimodalGlycanBERT(nn.Module):
         seq_config.cnn_kernel_size = config.cnn_kernel_size
         if config.use_cnn_frontend:
-            print(f"✅ Enabled Convolutional Front-End (Kernel={config.cnn_kernel_size})")
             self.seq_embeddings = ConvGlycanBERTEmbeddings(seq_config)
         else:
             self.seq_embeddings = GlycanBERTEmbeddings(seq_config)
@@ -675,7 +675,7 @@ class MultimodalGlycanBERT(nn.Module):
         )
         # ===== Distance Prediction Head (Topology) =====
-        # OPTIMIZED: Project down to 128 dim first to save GPU memory
         # (Batch, 256, 256, 768) -> (Batch, 256, 256, 128) reduces memory by 6x
         self.dist_proj = nn.Linear(config.seq_hidden_size, 128)
         self.distance_head = nn.Sequential(
@@ -706,8 +706,8 @@ class MultimodalGlycanBERT(nn.Module):
         seq_token_ids: torch.Tensor,
         seq_attention_mask: torch.Tensor,
         seq_residue_ids: torch.Tensor,
-        seq_branch_depths: Optional[torch.Tensor] = None,  # NEW: Branch depths
-        seq_linkage_types: Optional[torch.Tensor] = None,  # NEW: Linkage types
         ms_token_ids: torch.Tensor = None,
         ms_attention_mask: torch.Tensor = None,
         has_ms: torch.Tensor = None,
@@ -718,11 +718,11 @@ class MultimodalGlycanBERT(nn.Module):
         seq_labels: Optional[torch.Tensor] = None,
         ms_labels: Optional[torch.Tensor] = None,
         struct_labels: Optional[torch.Tensor] = None,
-        dist_labels: Optional[torch.Tensor] = None,  # NEW: Topology distance labels
         return_dict: bool = True,
     ) -> Dict[str, torch.Tensor]:
         """
-        Forward pass for multimodal BERT v3.
         Args:
             seq_token_ids: (batch_size, seq_len) - Sequence token IDs
@@ -839,7 +839,7 @@ class MultimodalGlycanBERT(nn.Module):
         seq_loss = None
         ms_loss = None
         struct_loss = None
-        dist_loss = None  # NEW: Topology distance loss
         if seq_labels is not None:
             loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
@@ -921,11 +921,11 @@ class MultimodalGlycanBERT(nn.Module):
                 'seq_loss': seq_loss,
                 'ms_loss': ms_loss,
                 'struct_loss': struct_loss,
-                'dist_loss': dist_loss,  # NEW: Topology loss
                 'seq_logits': seq_logits,
                 'ms_logits': ms_logits,
                 'struct_logits': struct_logits,
-                'dist_predictions': dist_predictions,  # NEW: Distance predictions
                 'seq_hidden': seq_hidden,
                 'ms_hidden': ms_hidden,
                 'struct_hidden': struct_hidden,
@@ -970,7 +970,7 @@ class MultimodalGlycanBERT(nn.Module):
 if __name__ == "__main__":
     # Test the model
     print("="*80)
-    print("Testing Multimodal GlycanBERT v3")
     print("="*80)
     # Create config
@@ -1081,4 +1081,3 @@ if __name__ == "__main__":
     print(f"\n{'='*80}")
     print("Model Test Complete!")
     print("="*80)

 """
+Bertose model
+Core glycan representation model with three modalities:
 - Sequence (WURCS atomic tokenization)
 - MS (mass spectrometry peaks, RT, intensity)
 - 3D structure (VQ-VAE discrete tokens, 4 per residue)
             config.max_position_embeddings, config.hidden_size
         )
+        # Branch depth embeddings encode depth in the glycan tree.
         max_branch_depth = getattr(config, "max_branch_depth", 8)
         self.branch_embeddings = nn.Embedding(max_branch_depth, config.hidden_size)
+        # Linkage type embeddings encode glycosidic bond chemistry.
         # 0=none, 1=1-3, 2=1-4, 3=1-6, etc.
         num_linkage_types = getattr(config, "num_linkage_types", 9)
         self.linkage_embeddings = nn.Embedding(num_linkage_types, config.hidden_size)
         position_ids = self.position_ids[:, :seq_len]
         x = x + self.position_embeddings(position_ids)
+        # Add branch depth embeddings.
         if branch_depths is not None:
             # Clamp to valid range
             branch_depths = branch_depths.clamp(0, self.branch_embeddings.num_embeddings - 1)
             x = x + self.branch_embeddings(branch_depths)
+        # Add linkage type embeddings.
         if linkage_types is not None:
             linkage_types = linkage_types.clamp(0, self.linkage_embeddings.num_embeddings - 1)
             x = x + self.linkage_embeddings(linkage_types)
 class MultimodalGlycanBERTConfig:
+    """Configuration for the Bertose model."""
     def __init__(
         self,
         self.seq_loss_weight = seq_loss_weight
         self.ms_loss_weight = ms_loss_weight
         self.struct_loss_weight = struct_loss_weight
+        self.dist_loss_weight = 0.25
         # Token IDs
         self.pad_token_id = pad_token_id
 class MultimodalGlycanBERT(nn.Module):
     """
+    Bertose model for glycan representation learning.
     Architecture:
     1. Separate encoders for each modality (sequence, MS, 3D structure)
         seq_config.cnn_kernel_size = config.cnn_kernel_size
         if config.use_cnn_frontend:
+            print(f"Enabled convolutional front-end (kernel={config.cnn_kernel_size})")
             self.seq_embeddings = ConvGlycanBERTEmbeddings(seq_config)
         else:
             self.seq_embeddings = GlycanBERTEmbeddings(seq_config)
         )
         # ===== Distance Prediction Head (Topology) =====
+        # Project down to 128 dimensions first to reduce memory use.
         # (Batch, 256, 256, 768) -> (Batch, 256, 256, 128) reduces memory by 6x
         self.dist_proj = nn.Linear(config.seq_hidden_size, 128)
         self.distance_head = nn.Sequential(
         seq_token_ids: torch.Tensor,
         seq_attention_mask: torch.Tensor,
         seq_residue_ids: torch.Tensor,
+        seq_branch_depths: Optional[torch.Tensor] = None,
+        seq_linkage_types: Optional[torch.Tensor] = None,
         ms_token_ids: torch.Tensor = None,
         ms_attention_mask: torch.Tensor = None,
         has_ms: torch.Tensor = None,
         seq_labels: Optional[torch.Tensor] = None,
         ms_labels: Optional[torch.Tensor] = None,
         struct_labels: Optional[torch.Tensor] = None,
+        dist_labels: Optional[torch.Tensor] = None,
         return_dict: bool = True,
     ) -> Dict[str, torch.Tensor]:
         """
+        Forward pass for Bertose.
         Args:
             seq_token_ids: (batch_size, seq_len) - Sequence token IDs
         seq_loss = None
         ms_loss = None
         struct_loss = None
+        dist_loss = None
         if seq_labels is not None:
             loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
                 'seq_loss': seq_loss,
                 'ms_loss': ms_loss,
                 'struct_loss': struct_loss,
+                'dist_loss': dist_loss,
                 'seq_logits': seq_logits,
                 'ms_logits': ms_logits,
                 'struct_logits': struct_logits,
+                'dist_predictions': dist_predictions,
                 'seq_hidden': seq_hidden,
                 'ms_hidden': ms_hidden,
                 'struct_hidden': struct_hidden,
 if __name__ == "__main__":
     # Test the model
     print("="*80)
+    print("Testing Bertose model")
     print("="*80)
     # Create config
     print(f"\n{'='*80}")
     print("Model Test Complete!")
     print("="*80)