Apply AFFINose display capitalization

Browse files

Files changed (8) hide show

README.md +9 -9
SHA256SUMS +7 -7
config.json +2 -2
src/affinose_dataset.py +4 -4
src/affinose_inference.py +6 -6
src/affinose_model.py +20 -20
src/bertose_layers.py +3 -3
src/bertose_model.py +5 -5

README.md CHANGED Viewed

@@ -11,19 +11,19 @@ tags:
 - pytorch
 ---
-# Affinose Interaction Model
-This repository contains the Affinose checkpoint for protein-glycan interaction inference. Affinose combines Bertose glycan token representations with per-residue ESM-C protein embeddings and returns a scalar interaction score.
 ## Files
-- `checkpoints/affinose_interaction_model.pt` - Affinose interaction checkpoint.
 - `vocab/bpe_vocabulary.json` - WURCS BPE vocabulary for glycan tokenization.
-- `src/affinose_model.py` - Affinose architecture.
 - `src/affinose_inference.py` - standalone inference helper.
 - `src/affinose_dataset.py` - tokenizer and data utility helpers.
-- `src/bertose_model.py` - Bertose model definition used for glycan encoding.
-- `src/bertose_layers.py` - Transformer layers used by Bertose.
 - `src/wurcs_bpe_tokenizer.py` - WURCS BPE tokenizer.
 ## Input
@@ -32,7 +32,7 @@ Provide one protein-glycan pair or a CSV batch. Glycans should be WURCS strings.
 ## Protein Embedding Requirement
-Affinose expects per-residue ESM-C 300M embeddings with shape `[L, 960]`. Do not mean-pool the protein before passing it into Affinose.
 ESM-C is a separate EvolutionaryScale protein model. The ESM-C weights are not included in this repository. Users should install the `esm` package and let it download ESM-C 300M into their own runtime cache.
@@ -50,11 +50,11 @@ output = esmc.logits(
 protein_embeddings = output.embeddings  # per-residue ESM-C 300M embeddings
 ```
-If Hugging Face requests authentication for ESM-C, users should authenticate with their own Hugging Face account/token and accept any required EvolutionaryScale terms. Bertose/Affinose tokens are not required once these repositories are public.
 ## Output
-A scalar protein-glycan interaction score from the trained Affinose head.
 ## Scope

 - pytorch
 ---
+# AFFINose Interaction Model
+This repository contains the AFFINose checkpoint for protein-glycan interaction inference. AFFINose combines BERTose glycan token representations with per-residue ESM-C protein embeddings and returns a scalar interaction score.
 ## Files
+- `checkpoints/affinose_interaction_model.pt` - AFFINose interaction checkpoint.
 - `vocab/bpe_vocabulary.json` - WURCS BPE vocabulary for glycan tokenization.
+- `src/affinose_model.py` - AFFINose architecture.
 - `src/affinose_inference.py` - standalone inference helper.
 - `src/affinose_dataset.py` - tokenizer and data utility helpers.
+- `src/bertose_model.py` - BERTose model definition used for glycan encoding.
+- `src/bertose_layers.py` - Transformer layers used by BERTose.
 - `src/wurcs_bpe_tokenizer.py` - WURCS BPE tokenizer.
 ## Input
 ## Protein Embedding Requirement
+AFFINose expects per-residue ESM-C 300M embeddings with shape `[L, 960]` (one 960-dimensional vector per residue). Do not mean-pool the protein embeddings before passing them into AFFINose.
 ESM-C is a separate EvolutionaryScale protein model. The ESM-C weights are not included in this repository. Users should install the `esm` package and let it download ESM-C 300M into their own runtime cache.
 protein_embeddings = output.embeddings  # per-residue ESM-C 300M embeddings
 ```
+If Hugging Face requests authentication for ESM-C, users should authenticate with their own Hugging Face account/token and accept any required EvolutionaryScale terms. Access tokens for the BERTose/AFFINose repositories are not required once these repositories are public.
 ## Output
+A scalar protein-glycan interaction score from the trained AFFINose head.
 ## Scope

SHA256SUMS CHANGED Viewed

@@ -1,12 +1,12 @@
 622368f62c23e97e9137c277eaadcc93ee3901cbb420b591422bb1c2e19689a5  ./.gitattributes
-f474c23adc30c94a8f6867c8260213eddb07c9732ab285078f2f08a9ad9fd062  ./README.md
 533fe4f9317782e39ad7980caa0f47ad92f1be999dd9489425a9429e2e7c15cb  ./checkpoints/affinose_interaction_model.pt
-043fcb1c7eb97e22fefe8fadadeff97b56ede254b95e318553175837a1e57114  ./config.json
 1be88f15fd905882c711f7ceb59b619a93f9cee2c6c7c031f8dbabd35b29e9e0  ./requirements.txt
-175811ef7be90383787858fafbb929d7a87e039bc0185d4d0f6d216dc92a48ed  ./src/affinose_dataset.py
-6f83790933a2e4f10160abdca9a6db1a4407ded780abf1d71407963a556dedb6  ./src/affinose_inference.py
-de14de370a77237ef2ac5c88714c83a54ce3f696efb710f93f8be19106c7fa95  ./src/affinose_model.py
-6362da8e8de0dc4d580c7d94ef6ab1dbc737da13127fc4078681ce6315180086  ./src/bertose_layers.py
-3c5b826fcf5850749f74d980eee48d0595557f3d6e2a58aa873902817eb65c64  ./src/bertose_model.py
 0bc54399362945601bcfd403441fc80968d173200dd0561f57568b2053a94839  ./src/wurcs_bpe_tokenizer.py
 6a572afdf53f1494ab96c896876b824ca7ea749777352606aa9f96bf270ceecc  ./vocab/bpe_vocabulary.json

 622368f62c23e97e9137c277eaadcc93ee3901cbb420b591422bb1c2e19689a5  ./.gitattributes
+f57fc6622ab0a2eca11a18e28d94744334b62640e25a00e5e3893f6eeefe3e7a  ./README.md
 533fe4f9317782e39ad7980caa0f47ad92f1be999dd9489425a9429e2e7c15cb  ./checkpoints/affinose_interaction_model.pt
+b8621448bb34d81b66e2b0050f3663375841d8f333d3973892ece32e7cc31880  ./config.json
 1be88f15fd905882c711f7ceb59b619a93f9cee2c6c7c031f8dbabd35b29e9e0  ./requirements.txt
+81c746162e528c4469ffadb2e3d71ce9d2c34fd4e4a8b176cf0808e26ba60617  ./src/affinose_dataset.py
+b4c729626e405cbe7564d7021f51e5b3ba1272e6318ad4439dcdf002ec88fde6  ./src/affinose_inference.py
+8041084cbe694d26d356feb7c0f66173e89c98b0ef18c73b407084e7a7d1b7a3  ./src/affinose_model.py
+3587aa789041e4ee215dcaa0286847d45621e9514c0ed3efa314fb89cf40f4d7  ./src/bertose_layers.py
+773023d70be02ecf4a26b79b18b2b70ee1b39e01ec287386cb162590aaf90767  ./src/bertose_model.py
 0bc54399362945601bcfd403441fc80968d173200dd0561f57568b2053a94839  ./src/wurcs_bpe_tokenizer.py
 6a572afdf53f1494ab96c896876b824ca7ea749777352606aa9f96bf270ceecc  ./vocab/bpe_vocabulary.json

config.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-  "model_family": "Affinose",
   "release_name": "affinose-interaction-model",
   "checkpoint": "checkpoints/affinose_interaction_model.pt",
   "vocabulary": "vocab/bpe_vocabulary.json",
-  "glycan_encoder": "Bertose glycan encoder",
   "glycan_dim": 768,
   "protein_encoder": "ESM-C 300M",
   "protein_dim": 960,

 {
+  "model_family": "AFFINose",
   "release_name": "affinose-interaction-model",
   "checkpoint": "checkpoints/affinose_interaction_model.pt",
   "vocabulary": "vocab/bpe_vocabulary.json",
+  "glycan_encoder": "BERTose glycan encoder",
   "glycan_dim": 768,
   "protein_encoder": "ESM-C 300M",
   "protein_dim": 960,

src/affinose_dataset.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """
-Affinose dataset utilities — per-residue protein embeddings for cross-attention
-Affinose keeps per-residue ESM-C embeddings [L, 960] rather than mean-pooled vectors so glycan tokens can cross-attend to protein residues.
 """
 import json
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
 def load_bpe_tokenizer(vocab_path: str):
     """
-    Load the Bertose BPE tokenizer directly from source.
     Bypasses downstream_tasks package imports. Adds utils
     directory to sys.path and imports WURCSBPETokenizer directly.
@@ -75,7 +75,7 @@ class AffinoseInteractionDataset(Dataset):
     Dataset for glycan-protein interaction with cross-attention support.
     Returns:
-      - BPE-tokenized glycan sequences for live Bertose forward pass
       - Per-residue ESM-C protein embeddings [Lp, D] (NOT mean-pooled)
       - Masks for both sides (for cross-attention padding)

 """
+AFFINose dataset utilities — per-residue protein embeddings for cross-attention
+AFFINose keeps per-residue ESM-C embeddings [L, 960] rather than mean-pooled vectors so glycan tokens can cross-attend to protein residues.
 """
 import json
 def load_bpe_tokenizer(vocab_path: str):
     """
+    Load the BERTose BPE tokenizer directly from source.
     Bypasses downstream_tasks package imports. Adds utils
     directory to sys.path and imports WURCSBPETokenizer directly.
     Dataset for glycan-protein interaction with cross-attention support.
     Returns:
+      - BPE-tokenized glycan sequences for live BERTose forward pass
       - Per-residue ESM-C protein embeddings [Lp, D] (NOT mean-pooled)
       - Masks for both sides (for cross-attention padding)

src/affinose_inference.py CHANGED Viewed

@@ -1,8 +1,8 @@
 """
-Affinose inference — standalone prediction pipeline
-Architecture: Bertose glycan encoding + ESM-C protein embeddings + cross-attention fusion.
-    GLYCAN:  WURCS -> BPE -> Bertose (frozen) -> [B, Lg, 768] -> proj -> [B, Lg, 512]
     PROTEIN: ESM-C per-residue -> [B, Lp, 960] -> proj -> [B, Lp, 512]
                                     |
               2x CrossAttentionBlock(d=512, 8H, FFN=1024)
@@ -50,11 +50,11 @@ class AffinosePredictor:
         logger.info(f"Loading BPE tokenizer from {vocab_path}")
         self.tokenizer = load_bpe_tokenizer(vocab_path)
-        logger.info(f"Loading Bertose from {bertose_checkpoint}")
         bertose_config, seq_embeddings, seq_layers = load_bertose_encoder(
             bertose_checkpoint, freeze_layers=12)
-        logger.info("Building Affinose interaction model")
         self.model = AffinoseInteractionModel(
             seq_embeddings=seq_embeddings,
             seq_layers=seq_layers,
@@ -149,7 +149,7 @@ class AffinosePredictor:
 def main():
-    parser = argparse.ArgumentParser(description="Affinose interaction inference")
     parser.add_argument("--checkpoint", required=True)
     parser.add_argument("--bertose_checkpoint", required=True)
     parser.add_argument("--vocab_path", required=True)

 """
+AFFINose inference — standalone prediction pipeline
+Architecture: BERTose glycan encoding + ESM-C protein embeddings + cross-attention fusion.
+    GLYCAN:  WURCS -> BPE -> BERTose (frozen) -> [B, Lg, 768] -> proj -> [B, Lg, 512]
     PROTEIN: ESM-C per-residue -> [B, Lp, 960] -> proj -> [B, Lp, 512]
                                     |
               2x CrossAttentionBlock(d=512, 8H, FFN=1024)
         logger.info(f"Loading BPE tokenizer from {vocab_path}")
         self.tokenizer = load_bpe_tokenizer(vocab_path)
+        logger.info(f"Loading BERTose from {bertose_checkpoint}")
         bertose_config, seq_embeddings, seq_layers = load_bertose_encoder(
             bertose_checkpoint, freeze_layers=12)
+        logger.info("Building AFFINose interaction model")
         self.model = AffinoseInteractionModel(
             seq_embeddings=seq_embeddings,
             seq_layers=seq_layers,
 def main():
+    parser = argparse.ArgumentParser(description="AFFINose interaction inference")
     parser.add_argument("--checkpoint", required=True)
     parser.add_argument("--bertose_checkpoint", required=True)
     parser.add_argument("--vocab_path", required=True)

src/affinose_model.py CHANGED Viewed

@@ -1,8 +1,8 @@
 """
-Affinose interaction model — cross-attention with live Bertose encoding
 Architecture:
-  GLYCAN:  WURCS → BPE → Bertose (live, freeze layers 0-3) → [B, Lg, 768]
                                                                 ↓ proj(768→512)
   PROTEIN: precomputed ESM-C → [B, Lp, 960]                    ↓
                                  ↓ proj(960→512)                ↓
@@ -15,7 +15,7 @@ Architecture:
                       [B, 1024]
                       ↓ MLP → binding score
-This release exposes the manuscript-facing Affinose architecture: Bertose glycan tokens, per-residue ESM-C protein embeddings, bidirectional cross-attention, pooled fusion and scalar interaction scoring.
 """
 import os
@@ -30,10 +30,10 @@ import torch.nn.functional as F
 # ============================================================================
-# Bertose model imports
 # ============================================================================
 def _default_bertose_root() -> Path:
-    """Resolve the Bertose source root without assuming a specific local path."""
     env_root = os.environ.get("BERTOSE_ROOT") or os.environ.get("BERTOSE_REPO_ROOT")
     if env_root:
         return Path(env_root).expanduser().resolve()
@@ -50,7 +50,7 @@ BERTOSE_ROOT = _default_bertose_root()
 def _ensure_bertose_imports():
-    """Add Bertose source directories to sys.path if not already present."""
     source_dir = Path(__file__).resolve().parent
     roots = [
         str(source_dir),
@@ -63,7 +63,7 @@ def _ensure_bertose_imports():
 def load_bertose_config():
-    """Create Bertose config matching the Bertose glycan encoder checkpoint."""
     _ensure_bertose_imports()
     try:
         from model.bertose_model import MultimodalGlycanBERTConfig
@@ -80,10 +80,10 @@ def load_bertose_encoder(
     checkpoint_path: str, freeze_layers: int = 4
 ):
     """
-    Load Bertose sequence encoder with pretrained weights.
     Args:
-        checkpoint_path: Path to pretrained Bertose checkpoint.
         freeze_layers: Number of leading transformer layers to freeze (layers 0 through freeze_layers - 1).
     Returns:
@@ -145,7 +145,7 @@ def load_bertose_encoder(
         p.numel() for layer in seq_layers for p in layer.parameters()
     )
     print(
-        f"  Bertose encoder: {total:,} params total, "
         f"{trainable:,} trainable (frozen layers 0-{freeze_layers - 1})"
     )
@@ -440,7 +440,7 @@ class CrossAttentionBlock(nn.Module):
 # ============================================================================
-# Affinose interaction model
 # ============================================================================
@@ -448,7 +448,7 @@ class AffinoseInteractionModel(nn.Module):
     """
     Glycan-protein interaction predictor with cross-attention.
-    Glycan: Live Bertose (partially frozen) → per-token [B, Lg, 768]
     Protein: Precomputed ESM-C per-residue [B, Lp, 960]
     Cross-attention: 2 bidirectional layers in shared 512-dim space
     SWE: Variable-length → fixed [B, 512] for each side
@@ -476,9 +476,9 @@ class AffinoseInteractionModel(nn.Module):
     ):
         """
         Args:
-            seq_embeddings: Pretrained Bertose embedding layer.
-            seq_layers: Pretrained Bertose transformer layers.
-            glycan_dim: Bertose output dimension (768).
             protein_dim: ESM-C per-residue dimension (960).
             shared_dim: Shared space for cross-attention (512).
             num_cross_layers: Number of cross-attention blocks.
@@ -505,7 +505,7 @@ class AffinoseInteractionModel(nn.Module):
         print(f"    pooling_mode={pooling_mode}")
         print(f"    interaction_mode={interaction_mode}")
-        # === Bertose sequence encoder (partially frozen) ===
         self.seq_embeddings = seq_embeddings
         self.seq_layers = seq_layers
@@ -634,7 +634,7 @@ class AffinoseInteractionModel(nn.Module):
         Returns:
             [B] binding score predictions.
         """
-        # === 1. Bertose forward: per-token embeddings ===
         x = self.seq_embeddings(token_ids, branch_depths, linkage_types)
         for layer in self.seq_layers:
             x = layer(x, attention_mask)
@@ -727,12 +727,12 @@ class AffinoseInteractionLoss(nn.Module):
 # ============================================================================
 if __name__ == "__main__":
     print("=" * 60)
-    print("Affinose interaction model architecture sanity check")
     print("=" * 60)
-    # Mock Bertose encoder (for testing without cluster)
     class MockEmbeddings(nn.Module):
-        """Mock Bertose embeddings for local testing."""
         def __init__(self, dim: int = 768):
             super().__init__()

 """
+AFFINose interaction model — cross-attention with live BERTose encoding
 Architecture:
+  GLYCAN:  WURCS → BPE → BERTose (live, freeze layers 0-3) → [B, Lg, 768]
                                                                 ↓ proj(768→512)
   PROTEIN: precomputed ESM-C → [B, Lp, 960]                    ↓
                                  ↓ proj(960→512)                ↓
                       [B, 1024]
                       ↓ MLP → binding score
+This release exposes the manuscript-facing AFFINose architecture: BERTose glycan tokens, per-residue ESM-C protein embeddings, bidirectional cross-attention, pooled fusion and scalar interaction scoring.
 """
 import os
 # ============================================================================
+# BERTose model imports
 # ============================================================================
 def _default_bertose_root() -> Path:
+    """Resolve the BERTose source root without assuming a specific local path."""
     env_root = os.environ.get("BERTOSE_ROOT") or os.environ.get("BERTOSE_REPO_ROOT")
     if env_root:
         return Path(env_root).expanduser().resolve()
 def _ensure_bertose_imports():
+    """Add BERTose source directories to sys.path if not already present."""
     source_dir = Path(__file__).resolve().parent
     roots = [
         str(source_dir),
 def load_bertose_config():
+    """Create BERTose config matching the BERTose glycan encoder checkpoint."""
     _ensure_bertose_imports()
     try:
         from model.bertose_model import MultimodalGlycanBERTConfig
     checkpoint_path: str, freeze_layers: int = 4
 ):
     """
+    Load BERTose sequence encoder with pretrained weights.
     Args:
+        checkpoint_path: Path to pretrained BERTose checkpoint.
         freeze_layers: Number of leading transformer layers to freeze (layers 0 through freeze_layers - 1).
     Returns:
         p.numel() for layer in seq_layers for p in layer.parameters()
     )
     print(
+        f"  BERTose encoder: {total:,} params total, "
         f"{trainable:,} trainable (frozen layers 0-{freeze_layers - 1})"
     )
 # ============================================================================
+# AFFINose interaction model
 # ============================================================================
     """
     Glycan-protein interaction predictor with cross-attention.
+    Glycan: Live BERTose (partially frozen) → per-token [B, Lg, 768]
     Protein: Precomputed ESM-C per-residue [B, Lp, 960]
     Cross-attention: 2 bidirectional layers in shared 512-dim space
     SWE: Variable-length → fixed [B, 512] for each side
     ):
         """
         Args:
+            seq_embeddings: Pretrained BERTose embedding layer.
+            seq_layers: Pretrained BERTose transformer layers.
+            glycan_dim: BERTose output dimension (768).
             protein_dim: ESM-C per-residue dimension (960).
             shared_dim: Shared space for cross-attention (512).
             num_cross_layers: Number of cross-attention blocks.
         print(f"    pooling_mode={pooling_mode}")
         print(f"    interaction_mode={interaction_mode}")
+        # === BERTose sequence encoder (partially frozen) ===
         self.seq_embeddings = seq_embeddings
         self.seq_layers = seq_layers
         Returns:
             [B] binding score predictions.
         """
+        # === 1. BERTose forward: per-token embeddings ===
         x = self.seq_embeddings(token_ids, branch_depths, linkage_types)
         for layer in self.seq_layers:
             x = layer(x, attention_mask)
 # ============================================================================
 if __name__ == "__main__":
     print("=" * 60)
+    print("AFFINose interaction model architecture sanity check")
     print("=" * 60)
+    # Mock BERTose encoder (for testing without cluster)
     class MockEmbeddings(nn.Module):
+        """Mock BERTose embeddings for local testing."""
         def __init__(self, dim: int = 768):
             super().__init__()

src/bertose_layers.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Bertose transformer layers.
 Transformer blocks adapted for WURCS glycan tokenization.
 """
@@ -10,7 +10,7 @@ import math
 class GlycanBERTConfig:
-    """Configuration for the Bertose transformer stack."""
     def __init__(
         self,
@@ -202,7 +202,7 @@ class GlycanBERTLayer(nn.Module):
 class GlycanBERT(nn.Module):
     """
-    Bertose transformer stack for masked language modeling.
     """
     def __init__(self, config: GlycanBERTConfig):

 """
+BERTose transformer layers.
 Transformer blocks adapted for WURCS glycan tokenization.
 """
 class GlycanBERTConfig:
+    """Configuration for the BERTose transformer stack."""
     def __init__(
         self,
 class GlycanBERT(nn.Module):
     """
+    BERTose transformer stack for masked language modeling.
     """
     def __init__(self, config: GlycanBERTConfig):

src/bertose_model.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Bertose model
 Core glycan representation model with three modalities:
 - Sequence (WURCS atomic tokenization)
@@ -148,7 +148,7 @@ def create_residue_level_mask(
 class MultimodalGlycanBERTConfig:
-    """Configuration for the Bertose model."""
     def __init__(
         self,
@@ -610,7 +610,7 @@ class CrossAttentionLayer(nn.Module):
 class MultimodalGlycanBERT(nn.Module):
     """
-    Bertose model for glycan representation learning.
     Architecture:
     1. Separate encoders for each modality (sequence, MS, 3D structure)
@@ -722,7 +722,7 @@ class MultimodalGlycanBERT(nn.Module):
         return_dict: bool = True,
     ) -> Dict[str, torch.Tensor]:
         """
-        Forward pass for Bertose.
         Args:
             seq_token_ids: (batch_size, seq_len) - Sequence token IDs
@@ -970,7 +970,7 @@ class MultimodalGlycanBERT(nn.Module):
 if __name__ == "__main__":
     # Test the model
     print("="*80)
-    print("Testing Bertose model")
     print("="*80)
     # Create config

 """
+BERTose model
 Core glycan representation model with three modalities:
 - Sequence (WURCS atomic tokenization)
 class MultimodalGlycanBERTConfig:
+    """Configuration for the BERTose model."""
     def __init__(
         self,
 class MultimodalGlycanBERT(nn.Module):
     """
+    BERTose model for glycan representation learning.
     Architecture:
     1. Separate encoders for each modality (sequence, MS, 3D structure)
         return_dict: bool = True,
     ) -> Dict[str, torch.Tensor]:
         """
+        Forward pass for BERTose.
         Args:
             seq_token_ids: (batch_size, seq_len) - Sequence token IDs
 if __name__ == "__main__":
     # Test the model
     print("="*80)
+    print("Testing BERTose model")
     print("="*80)
     # Create config