Add files using upload-large-folder tool

Browse files

Files changed (10) hide show

.gitattributes +1 -33
README.md +37 -0
SHA256SUMS +9 -0
checkpoints/best_v51_contrastive_model.pt +3 -0
config.json +11 -0
requirements.txt +5 -0
src/multimodal_glycan_bert_v3.py +1084 -0
src/wurcs_bpe_tokenizer.py +740 -0
vocab/bpe_ambiguity_tokens.json +721 -0
vocab/bpe_vocabulary.json +0 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,3 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

 *.pt filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,37 @@

+---
+library_name: pytorch
+license: other
+tags:
+- glycans
+- wurcs
+- bertose
+- ambiguity-resolution
+- contrastive-learning
+- pytorch
+---
+# Bertose IAR Ambiguity Resolver
+Draft private release for Bertose ambiguity-resolution inference.
+This repository contains the contrastive Bertose checkpoint used to score ambiguous WURCS BPE tokens and support iterative ambiguity resolution.
+## Files
+- `checkpoints/best_v51_contrastive_model.pt` - contrastive ambiguity-resolution checkpoint.
+- `vocab/bpe_vocabulary.json` - WURCS BPE vocabulary.
+- `vocab/bpe_ambiguity_tokens.json` - ambiguous BPE token map used by the resolver.
+- `src/multimodal_glycan_bert_v3.py` - model definition.
+- `src/wurcs_bpe_tokenizer.py` - WURCS BPE tokenizer.
+## Expected Input
+A single glycan WURCS string, or a CSV file of WURCS strings for batch runs.
+## Output
+Token-level ambiguity-resolution predictions with confidence scores. The companion notebook writes both summary and detail CSVs for batch runs.
+## Draft Notes
+This release does not claim to reconstruct final canonical WURCS strings by itself. It provides model-backed token-level updates and confidence values for ambiguous positions.

SHA256SUMS ADDED Viewed

	@@ -0,0 +1,9 @@

+622368f62c23e97e9137c277eaadcc93ee3901cbb420b591422bb1c2e19689a5  ./.gitattributes
+266caeb2fb9b68076343b40da91116dca0f2302f03cf28c2332b80b1a69c1758  ./README.md
+ae468f4e8c06dc0c3848138a474dc43249aa6d14dfd0df8f58d68fcaad371152  ./checkpoints/best_v51_contrastive_model.pt
+daf55c190fece0678064e41697a9545592beb1285f8aa74e595b933b9d37b4c2  ./config.json
+6a56e6f73b8f874470ecde6e538f3f5029ae23aa6c10559817d1c2a8b59b7c0f  ./requirements.txt
+0d9ce16bf90242f38621d64cd974ea5679bff4c2013bea8d7bffe1b8dd120794  ./src/multimodal_glycan_bert_v3.py
+0bc54399362945601bcfd403441fc80968d173200dd0561f57568b2053a94839  ./src/wurcs_bpe_tokenizer.py
+c68cd003370b2dcdb162f848f766e4e62f2653c6c38d205f8cbe53a9aabe2d74  ./vocab/bpe_ambiguity_tokens.json
+6a572afdf53f1494ab96c896876b824ca7ea749777352606aa9f96bf270ceecc  ./vocab/bpe_vocabulary.json

checkpoints/best_v51_contrastive_model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ae468f4e8c06dc0c3848138a474dc43249aa6d14dfd0df8f58d68fcaad371152
+size 557458637

config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "model_family": "Bertose",
+  "release_name": "bertose-iar-ambiguity-resolver",
+  "checkpoint": "checkpoints/best_v51_contrastive_model.pt",
+  "vocabulary": "vocab/bpe_vocabulary.json",
+  "ambiguity_tokens": "vocab/bpe_ambiguity_tokens.json",
+  "embedding_dim": 768,
+  "max_glycan_length": 256,
+  "input_format": "WURCS",
+  "output_format": "token_level_predictions"
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+torch
+numpy
+pandas
+tqdm
+huggingface_hub

src/multimodal_glycan_bert_v3.py ADDED Viewed

	@@ -0,0 +1,1084 @@

+"""
+Multimodal Glycan BERT Model v3
+Extends GlycanBERT to handle three modalities:
+- Sequence (WURCS atomic tokenization)
+- MS (mass spectrometry peaks, RT, intensity)
+- 3D structure (VQ-VAE discrete tokens, 4 per residue)
+Each modality has its own encoder, with cross-attention for sequence-structure alignment.
+"""
+import torch
+import torch.nn as nn
+from typing import Dict, Optional, Tuple
+import math
+try:
+    from .glycan_bert import GlycanBERTConfig, GlycanBERTEmbeddings, GlycanBERTLayer
+except ImportError:
+    from glycan_bert import GlycanBERTConfig, GlycanBERTEmbeddings, GlycanBERTLayer
+class ConvGlycanBERTEmbeddings(nn.Module):
+    """
+    Improved Convolutional front-end that mixes local WURCS context before the Transformer.
+    Key improvements over original:
+    1. Position embeddings added BEFORE convolution (provides spatial context to conv)
+    2. Residual connection (conv enriches embeddings rather than replacing them)
+    3. Multi-scale convolutions (kernel sizes 3, 5, 7) for better receptive field
+    4. Proper layer normalization on the residual path
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.token_embeddings = nn.Embedding(
+            config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
+        )
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, config.hidden_size
+        )
+        # NEW: Branch depth embeddings - encodes depth in glycan tree (0=root, 1=child, etc.)
+        max_branch_depth = getattr(config, "max_branch_depth", 8)
+        self.branch_embeddings = nn.Embedding(max_branch_depth, config.hidden_size)
+        # NEW: Linkage type embeddings - encodes chemistry of glycosidic bond
+        # 0=none, 1=1-3, 2=1-4, 3=1-6, etc.
+        num_linkage_types = getattr(config, "num_linkage_types", 9)
+        self.linkage_embeddings = nn.Embedding(num_linkage_types, config.hidden_size)
+        # Multi-scale convolutions for different receptive fields
+        kernel_size = getattr(config, "cnn_kernel_size", 3)
+        # Split channels evenly: 256 + 256 + 256 = 768 for hidden_size=768
+        channels_per_scale = config.hidden_size // 3
+        self.conv_layers = nn.ModuleList([
+            nn.Conv1d(
+                in_channels=config.hidden_size,
+                out_channels=channels_per_scale,
+                kernel_size=kernel_size + 2 * i,  # Kernels: 3, 5, 7
+                padding=(kernel_size + 2 * i) // 2,  # Same padding
+            )
+            for i in range(3)
+        ])
+        self.conv_activation = nn.GELU()
+        self.conv_proj = nn.Linear(channels_per_scale * 3, config.hidden_size)  # Project concatenated back
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.conv_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.register_buffer(
+            "position_ids",
+            torch.arange(config.max_position_embeddings).expand((1, -1)),
+        )
+        self.hidden_size = config.hidden_size
+    def forward(self, input_ids, branch_depths=None, linkage_types=None):
+        seq_len = input_ids.shape[1]
+        # Step 1: Token + Position embeddings FIRST (provides spatial context to conv)
+        x = self.token_embeddings(input_ids)  # (batch, seq, hidden)
+        position_ids = self.position_ids[:, :seq_len]
+        x = x + self.position_embeddings(position_ids)
+        # NEW: Add branch depth embeddings (encodes tree structure)
+        if branch_depths is not None:
+            # Clamp to valid range
+            branch_depths = branch_depths.clamp(0, self.branch_embeddings.num_embeddings - 1)
+            x = x + self.branch_embeddings(branch_depths)
+        # NEW: Add linkage type embeddings (encodes bond chemistry)
+        if linkage_types is not None:
+            linkage_types = linkage_types.clamp(0, self.linkage_embeddings.num_embeddings - 1)
+            x = x + self.linkage_embeddings(linkage_types)
+        x = self.LayerNorm(x)
+        # Step 2: Multi-scale convolution with RESIDUAL connection
+        # Convolution expects (batch, hidden, seq)
+        conv_in = x.permute(0, 2, 1)
+        # Apply multi-scale convolutions and concatenate
+        conv_outputs = []
+        for conv in self.conv_layers:
+            conv_out = self.conv_activation(conv(conv_in))
+            conv_outputs.append(conv_out)
+        # Concatenate multi-scale features and project back
+        conv_out = torch.cat(conv_outputs, dim=1)  # (batch, hidden, seq)
+        conv_out = conv_out.permute(0, 2, 1)  # (batch, seq, hidden)
+        conv_out = self.conv_proj(conv_out)  # Project to correct size
+        # Step 3: Residual connection - conv ENRICHES rather than replaces
+        x = self.conv_norm(x + self.dropout(conv_out))
+        return x
+def create_residue_level_mask(
+    seq_residue_ids: torch.Tensor,    # (batch, N_seq)
+    struct_residue_ids: torch.Tensor  # (batch, N_struct)
+) -> torch.Tensor:
+    """
+    Create residue-level attention mask for cross-attention.
+    Maps WURCS tokens to VQ-VAE structural tokens based on residue IDs.
+    A WURCS token with residue_id=0 can only attend to VQ-VAE tokens with residue_id=0.
+    Args:
+        seq_residue_ids: Residue IDs for sequence tokens (batch, N_seq)
+        struct_residue_ids: Residue IDs for structural tokens (batch, N_struct)
+    Returns:
+        Boolean mask (batch, N_seq, N_struct) where True = can attend
+    """
+    # Expand dimensions for broadcasting
+    # seq: (batch, N_seq, 1)
+    # struct: (batch, 1, N_struct)
+    mask = seq_residue_ids.unsqueeze(2) == struct_residue_ids.unsqueeze(1)
+    # Shape: (batch, N_seq, N_struct)
+    # Mask out structural tokens (residue_id = -1) and MS tokens (residue_id = -2)
+    # Only tokens with residue_id >= 0 can attend
+    mask &= (seq_residue_ids.unsqueeze(2) >= 0)
+    return mask  # True = can attend, False = cannot attend
+class MultimodalGlycanBERTConfig:
+    """Configuration for Multimodal GlycanBERT v3."""
+    def __init__(
+        self,
+        # Sequence modality
+        seq_vocab_size: int = 166,
+        seq_hidden_size: int = 768,
+        seq_num_layers: int = 12,
+        seq_num_heads: int = 12,
+        seq_max_length: int = 512,
+        # MS modality
+        ms_vocab_size: int = 242,
+        ms_hidden_size: int = 384,
+        ms_num_layers: int = 6,
+        ms_num_heads: int = 6,
+        ms_max_length: int = 150,
+        # 3D structure modality
+        struct_vocab_size: int = 1024,  # VQ-VAE codebook size
+        struct_hidden_size: int = 512,
+        struct_num_layers: int = 8,
+        struct_num_heads: int = 8,
+        struct_max_length: int = 200,
+        use_3d: bool = True,
+        # Cross-attention
+        use_cross_attention: bool = True,
+        cross_attn_num_heads: int = 8,
+        # Fusion
+        fusion_hidden_size: int = 768,
+        fusion_num_layers: int = 2,
+        # Training
+        hidden_dropout_prob: float = 0.1,
+        attention_probs_dropout_prob: float = 0.1,
+        layer_norm_eps: float = 1e-12,
+        initializer_range: float = 0.02,
+        # Conv front-end
+        use_cnn_frontend: bool = True,
+        cnn_kernel_size: int = 3,
+        # Loss weights
+        seq_loss_weight: float = 0.60,
+        ms_loss_weight: float = 0.15,
+        struct_loss_weight: float = 0.25,
+        # Token IDs
+        pad_token_id: int = 0,
+        mask_token_id: int = 1,
+    ):
+        # Sequence config
+        self.seq_vocab_size = seq_vocab_size
+        self.seq_hidden_size = seq_hidden_size
+        self.seq_num_layers = seq_num_layers
+        self.seq_num_heads = seq_num_heads
+        self.seq_max_length = seq_max_length
+        # MS config
+        self.ms_vocab_size = ms_vocab_size
+        self.ms_vocab_offset = seq_vocab_size  # MS tokens start at 166
+        self.ms_total_vocab_size = seq_vocab_size + ms_vocab_size  # 408 total
+        self.ms_hidden_size = ms_hidden_size
+        self.ms_num_layers = ms_num_layers
+        self.ms_num_heads = ms_num_heads
+        self.ms_max_length = ms_max_length
+        # Structure config
+        self.struct_vocab_size = struct_vocab_size
+        self.struct_hidden_size = struct_hidden_size
+        self.struct_num_layers = struct_num_layers
+        self.struct_num_heads = struct_num_heads
+        self.struct_max_length = struct_max_length
+        self.use_3d = use_3d
+        # Cross-attention config
+        self.use_cross_attention = use_cross_attention
+        self.cross_attn_num_heads = cross_attn_num_heads
+        # Fusion config
+        self.fusion_hidden_size = fusion_hidden_size
+        self.fusion_num_layers = fusion_num_layers
+        # Training config
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        # Conv front-end
+        self.use_cnn_frontend = use_cnn_frontend
+        self.cnn_kernel_size = cnn_kernel_size
+        # Loss weights
+        self.seq_loss_weight = seq_loss_weight
+        self.ms_loss_weight = ms_loss_weight
+        self.struct_loss_weight = struct_loss_weight
+        self.dist_loss_weight = 0.25  # NEW: Topology loss weight (default, can override from config)
+        # Token IDs
+        self.pad_token_id = pad_token_id
+        self.mask_token_id = mask_token_id
+    def to_seq_config(self) -> GlycanBERTConfig:
+        """Convert to sequence-only config."""
+        return GlycanBERTConfig(
+            vocab_size=self.seq_vocab_size,
+            hidden_size=self.seq_hidden_size,
+            num_hidden_layers=self.seq_num_layers,
+            num_attention_heads=self.seq_num_heads,
+            intermediate_size=self.seq_hidden_size * 4,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.seq_max_length,
+            layer_norm_eps=self.layer_norm_eps,
+            pad_token_id=self.pad_token_id,
+            mask_token_id=self.mask_token_id,
+            initializer_range=self.initializer_range,
+        )
+    def to_ms_config(self) -> GlycanBERTConfig:
+        """Convert to MS-only config."""
+        return GlycanBERTConfig(
+            vocab_size=self.ms_total_vocab_size,
+            hidden_size=self.ms_hidden_size,
+            num_hidden_layers=self.ms_num_layers,
+            num_attention_heads=self.ms_num_heads,
+            intermediate_size=self.ms_hidden_size * 4,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.ms_max_length,
+            layer_norm_eps=self.layer_norm_eps,
+            pad_token_id=self.pad_token_id,
+            mask_token_id=self.mask_token_id,
+            initializer_range=self.initializer_range,
+        )
+    def to_struct_config(self) -> GlycanBERTConfig:
+        """Convert to structure-only config."""
+        return GlycanBERTConfig(
+            vocab_size=self.struct_vocab_size,
+            hidden_size=self.struct_hidden_size,
+            num_hidden_layers=self.struct_num_layers,
+            num_attention_heads=self.struct_num_heads,
+            intermediate_size=self.struct_hidden_size * 4,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.struct_max_length,
+            layer_norm_eps=self.layer_norm_eps,
+            pad_token_id=self.pad_token_id,
+            mask_token_id=self.mask_token_id,
+            initializer_range=self.initializer_range,
+        )
+# =============================================================================
+# Improvement #1: Monosaccharide-Level Pooling
+# =============================================================================
+class MonosaccharidePooling(nn.Module):
+    """
+    Pool token representations to monosaccharide level, then aggregate.
+    This bridges the gap between token-level BERT and monosaccharide-level CNNs/GNNs.
+    Uses monosaccharide_indices from the data to know where each residue starts.
+    """
+    def __init__(self, hidden_size: int, num_attention_heads: int = 8, dropout: float = 0.1):
+        super().__init__()
+        self.hidden_size = hidden_size
+        # Attention pooling over monosaccharide representations
+        self.mono_attention = nn.MultiheadAttention(
+            embed_dim=hidden_size,
+            num_heads=num_attention_heads,
+            dropout=dropout,
+            batch_first=True
+        )
+        self.mono_norm = nn.LayerNorm(hidden_size)
+        # Final aggregation to single glycan representation
+        self.glycan_query = nn.Parameter(torch.randn(1, 1, hidden_size) * 0.02)
+        self.glycan_attention = nn.MultiheadAttention(
+            embed_dim=hidden_size,
+            num_heads=num_attention_heads,
+            dropout=dropout,
+            batch_first=True
+        )
+        self.glycan_norm = nn.LayerNorm(hidden_size)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,      # (batch, seq_len, hidden)
+        residue_ids: torch.Tensor,         # (batch, seq_len) - which residue each token belongs to
+        attention_mask: torch.Tensor = None,  # (batch, seq_len)
+    ) -> torch.Tensor:
+        """
+        Pool tokens to monosaccharide level, then to glycan level.
+        Returns:
+            Glycan representation: (batch, hidden_size)
+        """
+        batch_size = hidden_states.size(0)
+        device = hidden_states.device
+        # Get unique residue IDs per sample (excluding -1 padding)
+        max_residues = 50  # Reasonable max for glycans
+        # Pool tokens within each residue using mean pooling
+        mono_reps = torch.zeros(batch_size, max_residues, self.hidden_size, device=device)
+        mono_mask = torch.zeros(batch_size, max_residues, dtype=torch.bool, device=device)
+        for b in range(batch_size):
+            unique_residues = torch.unique(residue_ids[b][residue_ids[b] >= 0])
+            for i, rid in enumerate(unique_residues):
+                if i >= max_residues:
+                    break
+                token_mask = residue_ids[b] == rid
+                if attention_mask is not None:
+                    token_mask = token_mask & (attention_mask[b] > 0)
+                if token_mask.sum() > 0:
+                    mono_reps[b, i] = hidden_states[b][token_mask].mean(dim=0)
+                    mono_mask[b, i] = True
+        # Apply attention over monosaccharide representations
+        # Convert mask for attention: True = valid, need to invert for PyTorch
+        key_padding_mask = ~mono_mask  # True = ignore
+        mono_out, _ = self.mono_attention(
+            mono_reps, mono_reps, mono_reps,
+            key_padding_mask=key_padding_mask
+        )
+        mono_out = self.mono_norm(mono_reps + mono_out)
+        # Aggregate to single glycan representation using learned query
+        glycan_query = self.glycan_query.expand(batch_size, -1, -1)
+        glycan_out, _ = self.glycan_attention(
+            glycan_query, mono_out, mono_out,
+            key_padding_mask=key_padding_mask
+        )
+        glycan_out = self.glycan_norm(glycan_query + glycan_out)
+        return glycan_out.squeeze(1)  # (batch, hidden)
+# =============================================================================
+# Improvement #2: Residue Type Embeddings
+# =============================================================================
+# Common monosaccharide types vocabulary
+MONOSACCHARIDE_VOCAB = {
+    '[PAD_MONO]': 0, '[UNK_MONO]': 1,
+    'Glc': 2, 'GlcNAc': 3, 'GlcA': 4, 'GlcN': 5,
+    'Gal': 6, 'GalNAc': 7, 'GalA': 8, 'GalN': 9,
+    'Man': 10, 'ManNAc': 11, 'ManA': 12, 'ManN': 13,
+    'Fuc': 14, 'Rha': 15, 'Xyl': 16, 'Ara': 17,
+    'Neu5Ac': 18, 'Neu5Gc': 19, 'Kdn': 20, 'Sia': 21,
+    'GalNAcA': 22, 'GlcNAcA': 23, 'IdoA': 24, 'GulA': 25,
+    'Rib': 26, 'Lyx': 27, 'All': 28, 'Alt': 29,
+    'Tal': 30, 'Ido': 31, 'Qui': 32, 'Oli': 33,
+    'Tyv': 34, 'Abe': 35, 'Par': 36, 'Dig': 37,
+    'Col': 38, 'Dha': 39, 'Kdo': 40, 'Hep': 41,
+    'NeuroGc': 42, 'Muramic': 43, 'LDManHep': 44, 'DDManHep': 45,
+    'Bac': 46, 'Pse': 47, 'Leg': 48, 'Aci': 49,
+    '6dTal': 50, 'Fru': 51, 'Tag': 52, 'Sor': 53,
+    'Psi': 54, 'Sed': 55, 'MurNAc': 56, 'MurNGc': 57,
+    'Api': 58, 'Erwiniose': 59, 'Yer': 60, 'Thre': 61,
+    # Add more as needed, up to ~70
+}
+class ResidueTypeEmbeddings(nn.Module):
+    """
+    Learnable embeddings for monosaccharide types.
+    Instead of the model having to learn that 'a1221m' = Fucose from character patterns,
+    we explicitly add a Fucose embedding to all tokens belonging to that residue.
+    """
+    def __init__(self, hidden_size: int, num_mono_types: int = 70):
+        super().__init__()
+        self.mono_embeddings = nn.Embedding(num_mono_types, hidden_size)
+        self.mono_vocab = MONOSACCHARIDE_VOCAB
+        self.hidden_size = hidden_size
+    def forward(
+        self,
+        token_embeddings: torch.Tensor,  # (batch, seq_len, hidden)
+        residue_ids: torch.Tensor,        # (batch, seq_len)
+        mono_type_ids: torch.Tensor = None,  # (batch, max_residues) - monosaccharide type per residue
+    ) -> torch.Tensor:
+        """
+        Add residue type embeddings to token embeddings.
+        Args:
+            token_embeddings: Base token embeddings
+            residue_ids: Which residue each token belongs to (-1 for special tokens)
+            mono_type_ids: Monosaccharide type ID for each residue position
+        Returns:
+            Enhanced embeddings with residue type information
+        """
+        if mono_type_ids is None:
+            return token_embeddings
+        batch_size, seq_len, _ = token_embeddings.shape
+        enhanced = token_embeddings.clone()
+        # Add mono type embedding to each token based on its residue
+        for b in range(batch_size):
+            for pos in range(seq_len):
+                rid = residue_ids[b, pos].item()
+                if rid >= 0 and rid < mono_type_ids.size(1):
+                    mono_id = mono_type_ids[b, rid]
+                    enhanced[b, pos] = enhanced[b, pos] + self.mono_embeddings(mono_id)
+        return enhanced
+    @staticmethod
+    def get_mono_type_id(mono_name: str) -> int:
+        """Convert monosaccharide name to type ID."""
+        return MONOSACCHARIDE_VOCAB.get(mono_name, MONOSACCHARIDE_VOCAB['[UNK_MONO]'])
+# =============================================================================
+# Improvement #4: Relative Position Encoding for Glycan Trees
+# =============================================================================
+class RelativePositionBias(nn.Module):
+    """
+    Compute relative position bias for attention based on residue IDs.
+    Tokens in the same residue get distance 0.
+    Tokens in adjacent residues get distance ±1.
+    This helps the model understand glycan tree structure.
+    """
+    def __init__(self, num_heads: int, max_distance: int = 10):
+        super().__init__()
+        self.num_heads = num_heads
+        self.max_distance = max_distance
+        # Learnable bias for each relative distance (-max to +max)
+        num_distances = 2 * max_distance + 1
+        self.relative_bias = nn.Embedding(num_distances, num_heads)
+    def forward(self, residue_ids: torch.Tensor) -> torch.Tensor:
+        """
+        Compute relative position bias.
+        Args:
+            residue_ids: (batch, seq_len)
+        Returns:
+            Bias to add to attention scores: (batch, num_heads, seq_len, seq_len)
+        """
+        # Compute pairwise residue distances
+        # (batch, seq_len, 1) - (batch, 1, seq_len) = (batch, seq_len, seq_len)
+        distance = residue_ids.unsqueeze(2) - residue_ids.unsqueeze(1)
+        # Clamp to max distance range and shift to 0-indexed
+        distance_clamped = distance.clamp(-self.max_distance, self.max_distance)
+        distance_idx = distance_clamped + self.max_distance  # Now 0 to 2*max_distance
+        # Look up bias: (batch, seq_len, seq_len, num_heads)
+        bias = self.relative_bias(distance_idx)
+        # Transpose to (batch, num_heads, seq_len, seq_len)
+        bias = bias.permute(0, 3, 1, 2)
+        return bias
+class CrossAttentionLayer(nn.Module):
+    """
+    Cross-attention layer for sequence-structure alignment.
+    Allows sequence tokens to attend to structural atoms using attention masks.
+    """
+    def __init__(self, config: MultimodalGlycanBERTConfig):
+        super().__init__()
+        self.num_heads = config.cross_attn_num_heads
+        self.hidden_size = config.seq_hidden_size
+        self.head_dim = self.hidden_size // self.num_heads
+        assert self.hidden_size % self.num_heads == 0, "hidden_size must be divisible by num_heads"
+        # Query from sequence, Key/Value from structure (VQ-VAE tokens)
+        self.query = nn.Linear(config.seq_hidden_size, self.hidden_size)
+        self.key = nn.Linear(config.struct_hidden_size, self.hidden_size)
+        self.value = nn.Linear(config.struct_hidden_size, self.hidden_size)
+        self.output = nn.Linear(self.hidden_size, config.seq_hidden_size)
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.layer_norm = nn.LayerNorm(config.seq_hidden_size, eps=config.layer_norm_eps)
+    def forward(
+        self,
+        seq_hidden: torch.Tensor,  # (batch, seq_len, seq_hidden)
+        struct_hidden: torch.Tensor,  # (batch, struct_len, struct_hidden)
+        attention_mask: Optional[torch.Tensor] = None,  # (batch, seq_len, struct_len)
+    ) -> torch.Tensor:
+        """
+        Apply cross-attention from sequence to structure.
+        Args:
+            seq_hidden: Sequence hidden states
+            struct_hidden: Structure hidden states
+            attention_mask: Boolean mask (True = can attend, False = cannot attend)
+        Returns:
+            Updated sequence hidden states
+        """
+        batch_size, seq_len, _ = seq_hidden.shape
+        struct_len = struct_hidden.shape[1]
+        # Project to Q, K, V
+        Q = self.query(seq_hidden)  # (batch, seq_len, hidden)
+        K = self.key(struct_hidden)  # (batch, struct_len, hidden)
+        V = self.value(struct_hidden)  # (batch, struct_len, hidden)
+        # Reshape for multi-head attention
+        Q = Q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)  # (batch, heads, seq_len, head_dim)
+        K = K.view(batch_size, struct_len, self.num_heads, self.head_dim).transpose(1, 2)  # (batch, heads, struct_len, head_dim)
+        V = V.view(batch_size, struct_len, self.num_heads, self.head_dim).transpose(1, 2)  # (batch, heads, struct_len, head_dim)
+        # Compute attention scores
+        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)  # (batch, heads, seq_len, struct_len)
+        # Apply attention mask
+        if attention_mask is not None:
+            # attention_mask: (batch, seq_len, struct_len) -> (batch, 1, seq_len, struct_len)
+            attention_mask = attention_mask.unsqueeze(1)
+            # Convert boolean mask to float: True -> 0.0, False -> -10000.0
+            attention_mask = (~attention_mask).float() * -10000.0
+            scores = scores + attention_mask
+        # Softmax and dropout
+        attn_weights = torch.softmax(scores, dim=-1)  # (batch, heads, seq_len, struct_len)
+        attn_weights = self.dropout(attn_weights)
+        # Apply attention to values
+        context = torch.matmul(attn_weights, V)  # (batch, heads, seq_len, head_dim)
+        # Reshape back
+        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.hidden_size)
+        # Output projection
+        output = self.output(context)
+        output = self.dropout(output)
+        # Residual connection + layer norm
+        output = self.layer_norm(seq_hidden + output)
+        return output
+class MultimodalGlycanBERT(nn.Module):
+    """
+    Multimodal BERT for glycan representation learning (v3).
+    Architecture:
+    1. Separate encoders for each modality (sequence, MS, 3D structure)
+    2. Cross-attention for sequence-structure alignment
+    3. Modality-specific MLM heads
+    4. Fusion layer for combined representation
+    """
+    def __init__(self, config: MultimodalGlycanBERTConfig):
+        """
+        Build the per-modality encoders, projections, fusion layer and distance head.
+        Args:
+            config: Multimodal configuration. ``use_cnn_frontend`` selects the sequence
+                embedding class, ``use_3d`` controls whether the structure encoder is
+                created at all, and ``use_cross_attention`` (only consulted when
+                ``use_3d`` is true) adds the sequence-to-structure cross-attention layer.
+        Side effects:
+            Prints a one-line notice to stdout when the convolutional front-end is enabled.
+        """
+        super().__init__()
+        self.config = config
+        # ===== Sequence Encoder =====
+        seq_config = config.to_seq_config()
+        # The sequence config is built without the kernel size, so attach it here for
+        # ConvGlycanBERTEmbeddings, which reads it via getattr.
+        seq_config.cnn_kernel_size = config.cnn_kernel_size
+        if config.use_cnn_frontend:
+            print(f"✅ Enabled Convolutional Front-End (Kernel={config.cnn_kernel_size})")
+            self.seq_embeddings = ConvGlycanBERTEmbeddings(seq_config)
+        else:
+            self.seq_embeddings = GlycanBERTEmbeddings(seq_config)
+        self.seq_layers = nn.ModuleList([GlycanBERTLayer(seq_config) for _ in range(seq_config.num_hidden_layers)])
+        self.seq_mlm_head = nn.Linear(seq_config.hidden_size, seq_config.vocab_size)
+        # ===== MS Encoder =====
+        # ms_config.vocab_size is the combined sequence + MS vocabulary (ms_total_vocab_size)
+        ms_config = config.to_ms_config()
+        self.ms_embeddings = GlycanBERTEmbeddings(ms_config)
+        self.ms_layers = nn.ModuleList([GlycanBERTLayer(ms_config) for _ in range(ms_config.num_hidden_layers)])
+        self.ms_mlm_head = nn.Linear(ms_config.hidden_size, ms_config.vocab_size)
+        # ===== Structure Encoder (VQ-VAE tokens) =====
+        # When use_3d is False none of the struct_* modules nor cross_attention exist
+        # as attributes on this module.
+        if config.use_3d:
+            struct_config = config.to_struct_config()
+            self.struct_embeddings = GlycanBERTEmbeddings(struct_config)
+            self.struct_layers = nn.ModuleList([GlycanBERTLayer(struct_config) for _ in range(struct_config.num_hidden_layers)])
+            self.struct_mlm_head = nn.Linear(struct_config.hidden_size, struct_config.vocab_size)
+            # Cross-attention layer (sequence → VQ-VAE structural tokens)
+            if config.use_cross_attention:
+                self.cross_attention = CrossAttentionLayer(config)
+        # ===== Projection layers (align hidden sizes) =====
+        # Each projection maps a modality's hidden size to seq_hidden_size; Identity is
+        # used when the sizes already match.
+        if config.ms_hidden_size != config.seq_hidden_size:
+            self.ms_projection = nn.Linear(config.ms_hidden_size, config.seq_hidden_size)
+        else:
+            self.ms_projection = nn.Identity()
+        # struct_projection is also Identity when the structure modality is disabled
+        if config.use_3d and config.struct_hidden_size != config.seq_hidden_size:
+            self.struct_projection = nn.Linear(config.struct_hidden_size, config.seq_hidden_size)
+        else:
+            self.struct_projection = nn.Identity()
+        # ===== Fusion Layer =====
+        # Concatenate seq + ms + struct (or seq + ms only when use_3d is False)
+        fusion_input_size = config.seq_hidden_size * (3 if config.use_3d else 2)
+        self.fusion_layer = nn.Sequential(
+            nn.Linear(fusion_input_size, config.fusion_hidden_size),
+            nn.LayerNorm(config.fusion_hidden_size, eps=config.layer_norm_eps),
+            nn.GELU(),
+            nn.Dropout(config.hidden_dropout_prob),
+            nn.Linear(config.fusion_hidden_size, config.fusion_hidden_size),
+        )
+        # ===== Distance Prediction Head (Topology) =====
+        # OPTIMIZED: Project down to 128 dim first to save GPU memory
+        # (Batch, 256, 256, 768) -> (Batch, 256, 256, 128) reduces memory by 6x
+        self.dist_proj = nn.Linear(config.seq_hidden_size, 128)
+        # Small MLP mapping a 128-dim feature to a single scalar
+        self.distance_head = nn.Sequential(
+            nn.Linear(128, 64),
+            nn.ReLU(),
+            nn.Linear(64, 1)
+        )
+        # Initialize weights: nn.Module.apply visits every submodule and this module itself
+        self.apply(self._init_weights)
+    def _init_weights(self, module):
+        """Initialize weights."""
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+    def forward(
+        self,
+        seq_token_ids: torch.Tensor,
+        seq_attention_mask: torch.Tensor,
+        seq_residue_ids: torch.Tensor,
+        seq_branch_depths: Optional[torch.Tensor] = None,  # NEW: Branch depths
+        seq_linkage_types: Optional[torch.Tensor] = None,  # NEW: Linkage types
+        ms_token_ids: torch.Tensor = None,
+        ms_attention_mask: torch.Tensor = None,
+        has_ms: torch.Tensor = None,
+        struct_token_ids: Optional[torch.Tensor] = None,
+        struct_attention_mask: Optional[torch.Tensor] = None,
+        struct_residue_ids: Optional[torch.Tensor] = None,
+        has_3d: Optional[torch.Tensor] = None,
+        seq_labels: Optional[torch.Tensor] = None,
+        ms_labels: Optional[torch.Tensor] = None,
+        struct_labels: Optional[torch.Tensor] = None,
+        dist_labels: Optional[torch.Tensor] = None,  # NEW: Topology distance labels
+        return_dict: bool = True,
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Forward pass for multimodal BERT v3.
+        Args:
+            seq_token_ids: (batch_size, seq_len) - Sequence token IDs
+            seq_attention_mask: (batch_size, seq_len) - Sequence attention mask
+            seq_residue_ids: (batch_size, seq_len) - Sequence token residue IDs
+            ms_token_ids: (batch_size, ms_len) - MS token IDs
+            ms_attention_mask: (batch_size, ms_len) - MS attention mask
+            has_ms: (batch_size,) - Boolean mask for samples with MS data
+            struct_token_ids: (batch_size, struct_len) - Structure VQ-VAE token IDs (optional)
+            struct_attention_mask: (batch_size, struct_len) - Structure attention mask (optional)
+            struct_residue_ids: (batch_size, struct_len) - Structure token residue IDs (optional)
+            has_3d: (batch_size,) - Boolean mask for samples with 3D data (optional)
+            seq_labels: (batch_size, seq_len) - Masked sequence labels (optional)
+            ms_labels: (batch_size, ms_len) - Masked MS labels (optional)
+            struct_labels: (batch_size, struct_len) - Masked structure labels (optional)
+            return_dict: Whether to return dict or tuple
+        Returns:
+            Dictionary containing logits, hidden states, losses, etc.
+        """
+        batch_size = seq_token_ids.shape[0]
+        device = seq_token_ids.device
+        # ===== Sequence Encoder =====
+        # Pass branch_depths and linkage_types to embeddings for tree-aware encoding
+        seq_hidden = self.seq_embeddings(seq_token_ids, seq_branch_depths, seq_linkage_types)
+        for layer in self.seq_layers:
+            seq_hidden = layer(seq_hidden, seq_attention_mask)
+        seq_pooled = seq_hidden[:, 0, :]  # [CLS] token
+        seq_logits = self.seq_mlm_head(seq_hidden)
+        # ===== Distance Predictions (Topology) =====
+        # Compute pairwise distance predictions
+        # MEMORY OPTIMIZATION: Project to 128-dim first
+        seq_hidden_small = self.dist_proj(seq_hidden) # (batch, seq_len, 128)
+        # Expand for pairwise: (batch, seq_len, 1, 128) - (batch, 1, seq_len, 128)
+        h_i = seq_hidden_small.unsqueeze(2)
+        h_j = seq_hidden_small.unsqueeze(1)
+        h_diff = torch.abs(h_i - h_j)  # (batch, seq_len, seq_len, 128) - Much smaller!
+        dist_predictions = self.distance_head(h_diff)  # (batch, seq_len, seq_len, 1)
+        # ===== MS Encoder =====
+        ms_hidden = None
+        ms_pooled = None
+        ms_logits = None
+        if ms_token_ids is not None:
+            ms_hidden = self.ms_embeddings(ms_token_ids)
+            for layer in self.ms_layers:
+                ms_hidden = layer(ms_hidden, ms_attention_mask)
+            ms_pooled = ms_hidden[:, 0, :]  # [CLS] token
+            ms_logits = self.ms_mlm_head(ms_hidden)
+            # Zero out MS representations for samples without MS data
+            if has_ms is not None:
+                has_ms_expanded = has_ms.unsqueeze(1).float()  # (batch, 1)
+                ms_pooled = ms_pooled * has_ms_expanded
+        # ===== Structure Encoder =====
+        struct_pooled = None
+        struct_logits = None
+        struct_hidden = None
+        if self.config.use_3d and struct_token_ids is not None:
+            struct_hidden = self.struct_embeddings(struct_token_ids)
+            for layer in self.struct_layers:
+                struct_hidden = layer(struct_hidden, struct_attention_mask)
+            struct_pooled = struct_hidden[:, 0, :]  # [CLS] token
+            struct_logits = self.struct_mlm_head(struct_hidden)
+            # Zero out structure representations for samples without 3D data
+            if has_3d is not None:
+                has_3d_expanded = has_3d.unsqueeze(1).float()  # (batch, 1)
+                struct_pooled = struct_pooled * has_3d_expanded
+            # ===== Cross-Attention (Sequence → VQ-VAE Structural Tokens) =====
+            # Use residue-level alignment between WURCS tokens and VQ-VAE tokens
+            if self.config.use_cross_attention and struct_residue_ids is not None:
+                # Create residue-level mask
+                # WURCS token with residue_id=0 → VQ-VAE tokens with residue_id=0
+                residue_mask = create_residue_level_mask(
+                    seq_residue_ids=seq_residue_ids,
+                    struct_residue_ids=struct_residue_ids,
+                )  # (batch, N_seq, N_struct)
+                # Apply cross-attention: sequence tokens attend to VQ-VAE tokens
+                seq_hidden = self.cross_attention(
+                    seq_hidden=seq_hidden,
+                    struct_hidden=struct_hidden,  # VQ-VAE token features
+                    attention_mask=residue_mask,  # Residue-based mask
+                )
+                # Update seq_pooled after cross-attention
+                seq_pooled = seq_hidden[:, 0, :]
+        # ===== Fusion =====
+        # Project to common hidden size
+        ms_pooled_projected = self.ms_projection(ms_pooled)
+        if self.config.use_3d and struct_pooled is not None:
+            struct_pooled_projected = self.struct_projection(struct_pooled)
+            combined = torch.cat([seq_pooled, ms_pooled_projected, struct_pooled_projected], dim=-1)
+        else:
+            combined = torch.cat([seq_pooled, ms_pooled_projected], dim=-1)
+        fused_repr = self.fusion_layer(combined)
+        # ===== Compute Losses =====
+        total_loss = None
+        seq_loss = None
+        ms_loss = None
+        struct_loss = None
+        dist_loss = None  # NEW: Topology distance loss
+        if seq_labels is not None:
+            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
+            seq_loss = loss_fct(
+                seq_logits.view(-1, self.config.seq_vocab_size),
+                seq_labels.view(-1)
+            )
+        if ms_labels is not None:
+            ms_labels_masked = ms_labels.clone()
+            ms_labels_masked[~has_ms] = -100
+            # Only compute loss if there are valid labels (not all -100)
+            if (ms_labels_masked != -100).any():
+                loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
+                ms_loss = loss_fct(
+                    ms_logits.view(-1, self.config.ms_total_vocab_size),
+                    ms_labels_masked.view(-1)
+                )
+            else:
+                ms_loss = torch.tensor(0.0, device=seq_token_ids.device)
+        if self.config.use_3d and struct_labels is not None and struct_logits is not None:
+            struct_labels_masked = struct_labels.clone()
+            if has_3d is not None:
+                struct_labels_masked[~has_3d] = -100
+            # Only compute loss if there are valid labels (not all -100)
+            if (struct_labels_masked != -100).any():
+                loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
+                struct_loss = loss_fct(
+                    struct_logits.view(-1, self.config.struct_vocab_size),
+                    struct_labels_masked.view(-1)
+                )
+            else:
+                struct_loss = torch.tensor(0.0, device=seq_token_ids.device)
+        # ===== Distance Loss (Topology) =====
+        if dist_labels is not None:
+            # dist_predictions: (Batch, Seq, Seq, 1) -> (Batch, Seq, Seq)
+            preds = dist_predictions.squeeze(-1)
+            # Create mask for valid distance pairs (label != -1)
+            # Also respect attention mask to avoid padding
+            valid_mask = (dist_labels != -1) & (seq_attention_mask.unsqueeze(1) * seq_attention_mask.unsqueeze(2) == 1)
+            # DEBUG: Print once
+            if not hasattr(self, '_dist_debug_printed'):
+                print(f"[DIST DEBUG] dist_labels shape: {dist_labels.shape}, valid_mask.sum: {valid_mask.sum().item()}")
+                self._dist_debug_printed = True
+            if valid_mask.sum() > 0:
+                # MSE loss on valid positions only
+                loss_fct = nn.MSELoss()
+                dist_loss = loss_fct(preds[valid_mask], dist_labels[valid_mask].float())
+            else:
+                dist_loss = torch.tensor(0.0, device=seq_token_ids.device)
+        else:
+            # DEBUG: dist_labels is None
+            if not hasattr(self, '_dist_none_printed'):
+                print("[DIST DEBUG] dist_labels is None!")
+                self._dist_none_printed = True
+        # Weighted combination
+        losses = []
+        if seq_loss is not None:
+            losses.append(self.config.seq_loss_weight * seq_loss)
+        if ms_loss is not None:
+            losses.append(self.config.ms_loss_weight * ms_loss)
+        if struct_loss is not None:
+            losses.append(self.config.struct_loss_weight * struct_loss)
+        if dist_loss is not None:
+            losses.append(self.config.dist_loss_weight * dist_loss)
+        if losses:
+            total_loss = sum(losses)
+        if return_dict:
+            return {
+                'loss': total_loss,
+                'seq_loss': seq_loss,
+                'ms_loss': ms_loss,
+                'struct_loss': struct_loss,
+                'dist_loss': dist_loss,  # NEW: Topology loss
+                'seq_logits': seq_logits,
+                'ms_logits': ms_logits,
+                'struct_logits': struct_logits,
+                'dist_predictions': dist_predictions,  # NEW: Distance predictions
+                'seq_hidden': seq_hidden,
+                'ms_hidden': ms_hidden,
+                'struct_hidden': struct_hidden,
+                'seq_pooled': seq_pooled,
+                'ms_pooled': ms_pooled,
+                'struct_pooled': struct_pooled,
+                'fused_repr': fused_repr,
+            }
+        else:
+            return (total_loss, seq_logits, ms_logits, struct_logits, fused_repr)
+    def get_multimodal_representation(
+        self,
+        seq_token_ids: torch.Tensor,
+        seq_attention_mask: torch.Tensor,
+        seq_residue_ids: torch.Tensor,
+        ms_token_ids: torch.Tensor,
+        ms_attention_mask: torch.Tensor,
+        has_ms: torch.Tensor,
+        struct_token_ids: Optional[torch.Tensor] = None,
+        struct_attention_mask: Optional[torch.Tensor] = None,
+        struct_residue_ids: Optional[torch.Tensor] = None,
+        has_3d: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Get fused multimodal representation (for inference)."""
+        outputs = self.forward(
+            seq_token_ids=seq_token_ids,
+            seq_attention_mask=seq_attention_mask,
+            seq_residue_ids=seq_residue_ids,
+            ms_token_ids=ms_token_ids,
+            ms_attention_mask=ms_attention_mask,
+            has_ms=has_ms,
+            struct_token_ids=struct_token_ids,
+            struct_attention_mask=struct_attention_mask,
+            struct_residue_ids=struct_residue_ids,
+            has_3d=has_3d,
+            return_dict=True,
+        )
+        return outputs['fused_repr']
+if __name__ == "__main__":
+    # Test the model
+    print("="*80)
+    print("Testing Multimodal GlycanBERT v3")
+    print("="*80)
+    # Create config
+    config = MultimodalGlycanBERTConfig(
+        seq_vocab_size=166,
+        seq_hidden_size=768,
+        seq_num_layers=12,
+        seq_num_heads=12,
+        ms_vocab_size=242,
+        ms_hidden_size=384,
+        ms_num_layers=6,
+        ms_num_heads=6,
+        struct_vocab_size=1024,
+        struct_hidden_size=512,
+        struct_num_layers=8,
+        struct_num_heads=8,
+        use_3d=True,
+        use_cross_attention=True,
+        seq_loss_weight=0.60,
+        ms_loss_weight=0.15,
+        struct_loss_weight=0.25,
+    )
+    print(f"\nConfig:")
+    print(f"  Sequence vocab: {config.seq_vocab_size}")
+    print(f"  MS vocab: {config.ms_vocab_size}")
+    print(f"  Structure vocab: {config.struct_vocab_size}")
+    print(f"  Loss weights: seq={config.seq_loss_weight}, ms={config.ms_loss_weight}, struct={config.struct_loss_weight}")
+    # Create model
+    model = MultimodalGlycanBERT(config)
+    # Count parameters
+    total_params = sum(p.numel() for p in model.parameters())
+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print(f"\nModel Parameters:")
+    print(f"  Total: {total_params:,}")
+    print(f"  Trainable: {trainable_params:,}")
+    # Test forward pass
+    print(f"\n{'='*80}")
+    print("Testing Forward Pass (with Conv front-end)")
+    print("="*80)
+    batch_size = 4
+    seq_len = 128
+    ms_len = 50
+    struct_len = 40
+    # Create dummy inputs
+    seq_token_ids = torch.randint(0, config.seq_vocab_size, (batch_size, seq_len))
+    seq_attention_mask = torch.ones(batch_size, seq_len)
+    # Approximate: ~5 tokens per residue
+    seq_residue_ids = torch.div(
+        torch.arange(seq_len), 5, rounding_mode="floor"
+    ).unsqueeze(0).expand(batch_size, -1)
+    ms_token_ids = torch.randint(config.ms_vocab_offset, config.ms_total_vocab_size, (batch_size, ms_len))
+    ms_attention_mask = torch.ones(batch_size, ms_len)
+    struct_token_ids = torch.randint(0, config.struct_vocab_size, (batch_size, struct_len))
+    struct_attention_mask = torch.ones(batch_size, struct_len)
+    # Approximate: 4 tokens per residue for VQ-VAE tokens
+    struct_residue_ids = torch.div(
+        torch.arange(struct_len), 4, rounding_mode="floor"
+    ).unsqueeze(0).expand(batch_size, -1)
+    has_ms = torch.tensor([True, True, False, True])
+    has_3d = torch.tensor([True, False, True, True])
+    # Create labels for MLM
+    seq_labels = seq_token_ids.clone()
+    seq_labels[seq_labels != config.mask_token_id] = -100
+    ms_labels = ms_token_ids.clone()
+    ms_labels[ms_labels != config.mask_token_id] = -100
+    struct_labels = struct_token_ids.clone()
+    struct_labels[struct_labels != config.mask_token_id] = -100
+    # Forward pass
+    outputs = model(
+        seq_token_ids=seq_token_ids,
+        seq_attention_mask=seq_attention_mask,
+        seq_residue_ids=seq_residue_ids,
+        ms_token_ids=ms_token_ids,
+        ms_attention_mask=ms_attention_mask,
+        has_ms=has_ms,
+        struct_token_ids=struct_token_ids,
+        struct_attention_mask=struct_attention_mask,
+        struct_residue_ids=struct_residue_ids,
+        has_3d=has_3d,
+        seq_labels=seq_labels,
+        ms_labels=ms_labels,
+        struct_labels=struct_labels,
+    )
+    print(f"\nOutput shapes:")
+    print(f"  seq_logits: {outputs['seq_logits'].shape}")
+    print(f"  ms_logits: {outputs['ms_logits'].shape}")
+    print(f"  struct_logits: {outputs['struct_logits'].shape}")
+    print(f"  fused_repr: {outputs['fused_repr'].shape}")
+    print(f"\nLosses:")
+    print(f"  Total loss: {outputs['loss'].item():.4f}")
+    print(f"  Sequence loss: {outputs['seq_loss'].item():.4f}")
+    print(f"  MS loss: {outputs['ms_loss'].item():.4f}")
+    print(f"  Structure loss: {outputs['struct_loss'].item():.4f}")
+    print(f"\n{'='*80}")
+    print("Model Test Complete!")
+    print("="*80)

src/wurcs_bpe_tokenizer.py ADDED Viewed

	@@ -0,0 +1,740 @@

+#!/usr/bin/env python3
+"""
+WURCS-BPE Tokenizer
+A hybrid tokenizer that learns semantic subwords from WURCS while preserving
+the ability to handle rare/novel glycan structures character-by-character.
+Key features:
+1. Pre-tokenization: Split WURCS into semantic units (residues, linkages, mods)
+2. BPE: Learn subword merges from corpus
+3. Character fallback: Handle novel structures
+4. Tree embeddings: Preserve branch_depth and linkage_type per token
+Usage:
+    # Train BPE on corpus
+    tokenizer = WURCSBPETokenizer.train_from_corpus(
+        wurcs_strings,
+        num_merges=500,
+        output_path="bpe_vocabulary.json"
+    )
+    # Tokenize
+    result = tokenizer.tokenize(wurcs_string)
+"""
+import json
+import re
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Set
+import pickle
+class WURCSPreTokenizer:
+    """
+    Pre-tokenize WURCS into semantic units before BPE.
+    WURCS format: WURCS=2.0/count/[residues]/indices/linkages
+    We split into:
+    - Residues: [a2122h-1b_1-5_2*NCC/3=O] -> one unit per []
+    - Linkages: a4-b1 -> one unit per linkage
+    - Special markers: [BRANCH_OPEN], [BRANCH_CLOSE], etc.
+    """
+    # Residue patterns for common monosaccharides
+    RESIDUE_PATTERN = re.compile(r'\[([^\]]+)\]')
+    LINKAGE_PATTERN = re.compile(r'([a-z])(\d+|\?)-([a-z])(\d+|\?)')
+    def __init__(self):
+        self.special_tokens = {
+            '[PAD]': 0,
+            '[UNK]': 1,
+            '[START]': 2,
+            '[END]': 3,
+            '[MASK]': 4,
+            '[BRANCH_OPEN]': 5,
+            '[BRANCH_CLOSE]': 6,
+            '[LINK]': 7,
+            '[MOD]': 8,
+            '[RESIDUE_ERROR]': 9,
+        }
+    def pre_tokenize(self, wurcs: str) -> List[Dict]:
+        """
+        Pre-tokenize WURCS into semantic units.
+        Returns list of dicts with:
+            - text: The unit text
+            - type: 'special', 'residue', 'linkage', 'mod', 'index'
+            - residue_id: Which residue this belongs to (-1 for special, -2 for linkage)
+            - branch_depth: Tree depth (computed later)
+        """
+        units = []
+        # Add start token
+        units.append({
+            'text': '[START]',
+            'type': 'special',
+            'residue_id': -1,
+            'branch_depth': 0,
+            'linkage_type': 0,
+        })
+        # Parse WURCS sections
+        if not wurcs.startswith('WURCS='):
+            units.append({'text': '[RESIDUE_ERROR]', 'type': 'special', 'residue_id': -1, 'branch_depth': 0, 'linkage_type': 0})
+            units.append({'text': '[END]', 'type': 'special', 'residue_id': -1, 'branch_depth': 0, 'linkage_type': 0})
+            return units
+        try:
+            parts = self._split_wurcs_sections(wurcs)
+            if len(parts) < 4:
+                return [{'text': '[ERROR]', 'type': 'special', 'residue_id': -1, 'branch_depth': 0, 'linkage_type': 0}]
+            # parts: WURCS=2.0/3,3,2/[a2122h-1b_1-5][a2122h-1a_1-5][a1122h-1b_1-5]/1-2-3-1/a4-b1_b3-c1_c4-d1
+            # section 2: residue definitions
+            # section 3: indices
+            # section 4: linkages (optional)
+            version = parts[0]  # WURCS=2.0
+            counts = parts[1]   # residue_count,node_count,link_count
+            residue_defs = parts[2]  # [res1][res2]...
+            indices = parts[3]  # 1-2-3-1
+            linkages = parts[4] if len(parts) > 4 else ""  # a4-b1_b3-c1
+            # Parse residue definitions
+            residue_list = self.RESIDUE_PATTERN.findall(residue_defs)
+            # Parse linkages to compute branch structure
+            linkage_list = linkages.split('_') if linkages else []
+            branch_points, residue_depths, linkage_types_map, adj = self._analyze_tree_structure(linkage_list, num_residues=len(residue_list))
+            # Compute distance matrix and cache it based on the linkage string (structure)
+            # This is the most expensive part, so we cache it
+            if not hasattr(self, '_dist_cache'): self._dist_cache = {}
+            if linkages not in self._dist_cache:
+                self._dist_cache[linkages] = self._compute_distance_matrix(adj, len(residue_list))
+            dist_matrix_raw = self._dist_cache[linkages]
+            # Parse indices to map positions to residue definitions
+            index_list = indices.split('-') if indices else []
+            # Process each residue instance
+            residue_letter = ord('a')
+            for idx, res_idx in enumerate(index_list):
+                current_residue_id = idx
+                res_letter = chr(residue_letter + idx)
+                # Check if this is a branch point - add branch marker before
+                if res_letter in branch_points and branch_points[res_letter] > 0:
+                    for _ in range(branch_points[res_letter]):
+                        units.append({
+                            'text': '[BRANCH_OPEN]',
+                            'type': 'special',
+                            'residue_id': -1,
+                            'branch_depth': residue_depths.get(res_letter, 0),
+                            'linkage_type': 0,
+                        })
+                # Get residue definition
+                try:
+                    res_def_idx = int(res_idx) - 1  # 1-indexed to 0-indexed
+                    res_def = residue_list[res_def_idx] if res_def_idx < len(residue_list) else ""
+                except (ValueError, IndexError):
+                    res_def = ""
+                # Split residue into base and modifications
+                res_parts = res_def.split('_')
+                base = res_parts[0] if res_parts else res_def
+                mods = res_parts[1:] if len(res_parts) > 1 else []
+                # Add residue base as a single unit
+                depth = residue_depths.get(res_letter, 0)
+                units.append({
+                    'text': base,
+                    'type': 'residue',
+                    'residue_id': current_residue_id,
+                    'branch_depth': depth,
+                    'linkage_type': 0,
+                })
+                # Add modifications
+                for mod in mods:
+                    units.append({
+                        'text': mod,
+                        'type': 'mod',
+                        'residue_id': current_residue_id,
+                        'branch_depth': depth,
+                        'linkage_type': 0,
+                    })
+            # Store distance matrix in units for easy access in tokenizer
+            if units:
+                # Find first residue unit or just use START
+                units[0]['distance_matrix'] = dist_matrix_raw
+            # Add linkages
+            for link in linkage_list:
+                if not link:
+                    continue
+                # Parse linkage type
+                lt = self._parse_linkage_type(link)
+                units.append({
+                    'text': link,
+                    'type': 'linkage',
+                    'residue_id': -2,
+                    'branch_depth': 0,
+                    'linkage_type': lt,
+                })
+        except Exception:
+            # Fallback for truly broken WURCS
+            pass
+        # Add end token
+        units.append({
+            'text': '[END]',
+            'type': 'special',
+            'residue_id': -1,
+            'branch_depth': 0,
+            'linkage_type': 0,
+        })
+        return units
+    def _split_wurcs_sections(self, wurcs: str) -> List[str]:
+        """Split WURCS string into sections, handling nested brackets."""
+        # Remove WURCS= prefix
+        if wurcs.startswith('WURCS='):
+            wurcs = wurcs[6:]
+        sections = []
+        current = ""
+        bracket_depth = 0
+        for char in wurcs:
+            if char == '[':
+                bracket_depth += 1
+                current += char
+            elif char == ']':
+                bracket_depth -= 1
+                current += char
+            elif char == '/' and bracket_depth == 0:
+                sections.append(current)
+                current = ""
+            else:
+                current += char
+        if current:
+            sections.append(current)
+        return sections
+    def _analyze_tree_structure(self, linkages: List[str], num_residues: int) -> Tuple[Dict, Dict, Dict, Dict]:
+        """Analyze linkages to compute branch points and residue depths."""
+        branch_points = defaultdict(int)  # residue -> number of children
+        children = defaultdict(list)
+        all_residues = set()
+        linkage_types = {}
+        for link in linkages:
+            match = self.LINKAGE_PATTERN.match(link)
+            if match:
+                from_res, from_pos, to_res, to_pos = match.groups()
+                children[from_res].append(to_res)
+                all_residues.add(from_res)
+                all_residues.add(to_res)
+                # Store linkage type
+                linkage_types[link] = self._parse_linkage_type(link)
+        # Build adjacency list for BFS
+        adj = defaultdict(list)
+        for link in linkages:
+            match = self.LINKAGE_PATTERN.match(link)
+            if match:
+                u = ord(match.group(1)) - ord('a')
+                v = ord(match.group(3)) - ord('a')
+                if 0 <= u < num_residues and 0 <= v < num_residues:
+                    adj[u].append(v)
+                    adj[v].append(u)
+        # Find branch points (residues with >1 child)
+        for res, kids in children.items():
+            if len(kids) > 1:
+                branch_points[res] = len(kids) - 1  # Number of extra branches
+        # Compute depths using BFS
+        # Find root (residue with no parent)
+        child_set = set()
+        for kids in children.values():
+            child_set.update(kids)
+        roots = all_residues - child_set
+        root = min(roots) if roots else 'a'
+        depths = {root: 0}
+        queue = [root]
+        while queue:
+            current = queue.pop(0)
+            for child in children.get(current, []):
+                if child not in depths:
+                    depths[child] = depths[current] + 1
+                    queue.append(child)
+        return branch_points, depths, linkage_types, adj
+    def _compute_distance_matrix(self, adj: Dict[int, List[int]], num_residues: int) -> List[List[int]]:
+        """
+        Compute shortest path distance (number of bonds) between all residue pairs using BFS.
+        """
+        if num_residues == 0:
+            return []
+        dist_matrix = [[-1] * num_residues for _ in range(num_residues)]
+        for i in range(num_residues):
+            dist_matrix[i][i] = 0
+            queue = [(i, 0)]
+            visited = {i}
+            while queue:
+                curr, d = queue.pop(0)
+                dist_matrix[i][curr] = d
+                for neighbor in adj[curr]:
+                    if neighbor not in visited:
+                        visited.add(neighbor)
+                        queue.append((neighbor, d + 1))
+        return dist_matrix
+    def _compute_distance_matrix_OLD(self, linkages: List[str], num_residues: int) -> List[List[int]]:
+        """
+        Compute shortest path distance (number of bonds) between all residue pairs.
+        Returns a symmetric N x N matrix where N is num_residues.
+        Values are integers (number of steps). 0 on diagonal. -1 if unreachable (shouldn't happen in single tree).
+        """
+        if num_residues == 0:
+            return []
+        # Initialize adjacency list
+        adj = defaultdict(list)
+        for link in linkages:
+            match = self.LINKAGE_PATTERN.match(link)
+            if match:
+                # WURCS indices are 1-based letters (a=1, b=2...)
+                from_res_char, _, to_res_char, _ = match.groups()
+                # Convert char to 0-based index
+                u = ord(from_res_char) - ord('a')
+                v = ord(to_res_char) - ord('a')
+                # Undirected graph for structural distance
+                if 0 <= u < num_residues and 0 <= v < num_residues:
+                    adj[u].append(v)
+                    adj[v].append(u)
+        # Compute All-Pairs Shortest Path (BFS from each node is fine for small N)
+        # Glycans are small (N ~ 5-20 usually), so O(N^2) BFS is cheap.
+        dist_matrix = [[-1] * num_residues for _ in range(num_residues)]
+        for i in range(num_residues):
+            dist_matrix[i][i] = 0
+            queue = [(i, 0)]
+            visited = {i}
+            while queue:
+                curr, d = queue.pop(0)
+                dist_matrix[i][curr] = d
+                for neighbor in adj[curr]:
+                    if neighbor not in visited:
+                        visited.add(neighbor)
+                        queue.append((neighbor, d + 1))
+        return dist_matrix
+    def _parse_linkage_type(self, link: str) -> int:
+        """Parse linkage string to get type ID."""
+        LINKAGE_TYPES = {
+            (1, 2): 0, (2, 1): 0,
+            (1, 3): 1, (3, 1): 1,
+            (1, 4): 2, (4, 1): 2,
+            (1, 6): 3, (6, 1): 3,
+            (2, 3): 4, (3, 2): 4,
+            (2, 6): 5, (6, 2): 5,
+            (3, 6): 6, (6, 3): 6,
+        }
+        match = self.LINKAGE_PATTERN.match(link)
+        if match:
+            _, from_pos, _, to_pos = match.groups()
+            try:
+                pos_tuple = (int(from_pos), int(to_pos))
+                return LINKAGE_TYPES.get(pos_tuple, 7)
+            except ValueError:
+                return 8  # Unknown
+        return 8
+class WURCSBPETokenizer:
+    """
+    BPE tokenizer for WURCS with tree-aware embeddings.
+    """
+    def __init__(self, vocab_path: Optional[str] = None):
+        self.pre_tokenizer = WURCSPreTokenizer()
+        # Special tokens (fixed)
+        self.special_tokens = self.pre_tokenizer.special_tokens
+        # BPE vocabulary
+        self.token_to_id: Dict[str, int] = {}
+        self.id_to_token: Dict[int, str] = {}
+        self.merges: List[Tuple[str, str]] = []
+        if vocab_path:
+            self.load_vocab(vocab_path)
+        else:
+            # Initialize with special tokens only
+            self.token_to_id = dict(self.special_tokens)
+            self.id_to_token = {v: k for k, v in self.token_to_id.items()}
+    @classmethod
+    def train_from_corpus(
+        cls,
+        wurcs_strings: List[str],
+        num_merges: int = 500,
+        output_path: Optional[str] = None,
+        min_frequency: int = 2,
+        max_token_length: Optional[int] = None,
+    ) -> 'WURCSBPETokenizer':
+        """
+        Train BPE on a corpus of WURCS strings.
+        Args:
+            wurcs_strings: List of WURCS strings
+            num_merges: Number of BPE merge operations
+            output_path: Optional path to save vocabulary
+            min_frequency: Minimum frequency for a token to be kept
+            max_token_length: Maximum length of a merged token (None = no limit)
+        Returns:
+            Trained tokenizer
+        """
+        tokenizer = cls()
+        pre_tok = WURCSPreTokenizer()
+        print(f"Training BPE on {len(wurcs_strings)} WURCS strings...")
+        # Step 1: Pre-tokenize all strings to get semantic units
+        all_units = []
+        for wurcs in wurcs_strings:
+            units = pre_tok.pre_tokenize(wurcs)
+            for unit in units:
+                if unit['type'] != 'special':
+                    all_units.append(unit['text'])
+        # Step 2: Count unit frequencies
+        unit_counts = Counter(all_units)
+        print(f"Found {len(unit_counts)} unique units")
+        # Step 3: Initialize vocabulary with characters from all units
+        char_vocab = set()
+        for unit in unit_counts:
+            for char in unit:
+                char_vocab.add(char)
+        # Build initial vocab: special tokens + characters
+        vocab_id = len(tokenizer.special_tokens)
+        for char in sorted(char_vocab):
+            tokenizer.token_to_id[char] = vocab_id
+            tokenizer.id_to_token[vocab_id] = char
+            vocab_id += 1
+        print(f"Initial vocab size: {vocab_id} (special + characters)")
+        # Step 4: Convert units to character sequences
+        word_freqs = {}
+        for unit, count in unit_counts.items():
+            if count >= min_frequency:
+                # Split into characters with space separator
+                chars = tuple(unit)
+                word_freqs[chars] = count
+        # Step 5: BPE merging
+        merges = []
+        for merge_idx in range(num_merges):
+            # Count pairs
+            pair_counts = Counter()
+            for word, freq in word_freqs.items():
+                for i in range(len(word) - 1):
+                    pair = (word[i], word[i + 1])
+                    pair_counts[pair] += freq
+            if not pair_counts:
+                break
+            # Find most frequent pair
+            best_pair = pair_counts.most_common(1)[0][0]
+            best_count = pair_counts[best_pair]
+            if best_count < min_frequency:
+                break
+            # Merge pair
+            new_token = best_pair[0] + best_pair[1]
+            # Check length constraint
+            if max_token_length and len(new_token) > max_token_length:
+                # remove this pair from consideration for this iteration and future?
+                # Actually, skipping it here is tricky because we need to ignore it in pair_counts next time
+                # Simpler: Just skip adding it to merges and modify word_freqs?
+                # No, if we don't merge, we just continue to the next best pair in THIS iteration.
+                # But pair_counts is already computed.
+                # We need to loop until we find a valid pair or run out
+                # In this simple implementation, let's just skip this merge efficiently
+                # We need to find the NEXT most common pair.
+                # Re-do finding best pair loop
+                found_valid_pair = False
+                for pair, count in pair_counts.most_common():
+                    token_candidate = pair[0] + pair[1]
+                    if max_token_length and len(token_candidate) > max_token_length:
+                        continue # Skip too long
+                    if count < min_frequency:
+                        break # Stop if frequency too low
+                    # Found valid pair
+                    best_pair = pair
+                    best_count = count
+                    new_token = token_candidate
+                    found_valid_pair = True
+                    break
+                if not found_valid_pair:
+                    print(f"  Stopping early: No more pairs satisfy max_token_length={max_token_length}")
+                    break
+            # Final check before merging (in case we didn't enter the if block but updated vars)
+            # Actually the logic above handles it. If we entered the block, we either found a new best_pair or broke.
+            merges.append(best_pair)
+            # Add to vocab
+            tokenizer.token_to_id[new_token] = vocab_id
+            tokenizer.id_to_token[vocab_id] = new_token
+            vocab_id += 1
+            # Update word_freqs
+            new_word_freqs = {}
+            for word, freq in word_freqs.items():
+                new_word = []
+                i = 0
+                while i < len(word):
+                    if i < len(word) - 1 and word[i] == best_pair[0] and word[i + 1] == best_pair[1]:
+                        new_word.append(new_token)
+                        i += 2
+                    else:
+                        new_word.append(word[i])
+                        i += 1
+                new_word_freqs[tuple(new_word)] = freq
+            word_freqs = new_word_freqs
+            if (merge_idx + 1) % 100 == 0:
+                print(f"  Merge {merge_idx + 1}/{num_merges}: '{best_pair[0]}' + '{best_pair[1]}' -> '{new_token}' (count={best_count})")
+        tokenizer.merges = merges
+        print(f"Final vocab size: {len(tokenizer.token_to_id)}")
+        # Save if requested
+        if output_path:
+            tokenizer.save_vocab(output_path)
+        return tokenizer
+    def apply_bpe(self, text: str) -> List[str]:
+        """Apply BPE merges to a text string."""
+        if text in self.token_to_id:
+            return [text]
+        # Split into characters
+        tokens = list(text)
+        # Apply merges
+        for pair in self.merges:
+            new_tokens = []
+            i = 0
+            while i < len(tokens):
+                if i < len(tokens) - 1 and tokens[i] == pair[0] and tokens[i + 1] == pair[1]:
+                    new_tokens.append(pair[0] + pair[1])
+                    i += 2
+                else:
+                    new_tokens.append(tokens[i])
+                    i += 1
+            tokens = new_tokens
+        return tokens
+    def tokenize(self, wurcs: str, max_length: int = 256) -> Dict:
+        """
+        Tokenize a WURCS string.
+        Returns:
+            Dict with:
+                - tokens: List of token strings
+                - token_ids: List of token IDs
+                - residue_ids: List of residue IDs
+                - branch_depths: List of branch depths
+                - linkage_types: List of linkage types
+                - attention_mask: Attention mask
+        """
+        # Pre-tokenize
+        units = self.pre_tokenizer.pre_tokenize(wurcs)
+        tokens = []
+        token_ids = []
+        residue_ids = []
+        branch_depths = []
+        linkage_types = []
+        for unit in units:
+            if unit['type'] == 'special':
+                # Special tokens stay as-is
+                tok = unit['text']
+                tokens.append(tok)
+                token_ids.append(self.token_to_id.get(tok, self.token_to_id['[UNK]']))
+                residue_ids.append(unit['residue_id'])
+                branch_depths.append(unit['branch_depth'])
+                linkage_types.append(unit['linkage_type'])
+            else:
+                # Apply BPE to this unit
+                bpe_tokens = self.apply_bpe(unit['text'])
+                for tok in bpe_tokens:
+                    tokens.append(tok)
+                    token_ids.append(self.token_to_id.get(tok, self.token_to_id['[UNK]']))
+                    residue_ids.append(unit['residue_id'])
+                    branch_depths.append(unit['branch_depth'])
+                    linkage_types.append(unit['linkage_type'])
+        # Truncate if needed
+        if len(tokens) > max_length:
+            tokens = tokens[:max_length - 1] + ['[END]']
+            token_ids = token_ids[:max_length - 1] + [self.token_to_id['[END]']]
+            residue_ids = residue_ids[:max_length - 1] + [-1]
+            branch_depths = branch_depths[:max_length - 1] + [0]
+            linkage_types = linkage_types[:max_length - 1] + [0]
+        # Create attention mask and pad
+        length = len(tokens)
+        attention_mask = [1] * length
+        while len(tokens) < max_length:
+            tokens.append('[PAD]')
+            token_ids.append(self.token_to_id['[PAD]'])
+            residue_ids.append(-1)
+            branch_depths.append(0)
+            linkage_types.append(0)
+            attention_mask.append(0)
+        # Pre-tokenize
+        units = self.pre_tokenizer.pre_tokenize(wurcs)
+        # Extract distance matrix from pre-tokenizer result
+        dist_matrix_raw = units[0].get('distance_matrix', [])
+        num_residues = len(dist_matrix_raw)
+        # Map token-to-token distances using residue_ids
+        # token_i is associated with residue_ids[i].
+        # residue_ids[i] is index into dist_matrix_raw.
+        # If residue_ids[i] == -1 (special), distance is undefined (use -1 or 999)
+        # Use UNPADDED length for distance matrix to save massive memory
+        # distance_matrix will be e.g. 20x20, while tokens are padded to 256
+        token_len = length
+        distance_matrix = [[-1] * token_len for _ in range(token_len)]
+        for i in range(token_len):
+            for j in range(token_len):
+                r_i = residue_ids[i]
+                r_j = residue_ids[j]
+                if r_i >= 0 and r_j >= 0 and r_i < num_residues and r_j < num_residues:
+                    distance_matrix[i][j] = dist_matrix_raw[r_i][r_j]
+                else:
+                    distance_matrix[i][j] = -1 # Special/Padding
+        # MEMORY OPTIMIZATION: Do NOT pad matrix here.
+        # Pad on-the-fly in Dataset class instead.
+        # This saves massive memory (0.2GB vs 66GB).
+        return {
+            'tokens': tokens,
+            'token_ids': token_ids,
+            'residue_ids': residue_ids,
+            'branch_depths': branch_depths,
+            'linkage_types': linkage_types,
+            'attention_mask': attention_mask,
+            'distance_matrix': distance_matrix, # New Output
+            'length': length,
+        }
+    def save_vocab(self, path: str):
+        """Save vocabulary to JSON file."""
+        data = {
+            'special_tokens': self.special_tokens,
+            'token_to_id': self.token_to_id,
+            'merges': [list(m) for m in self.merges],
+            'metadata': {
+                'vocab_size': len(self.token_to_id),
+                'num_merges': len(self.merges),
+            }
+        }
+        with open(path, 'w') as f:
+            json.dump(data, f, indent=2)
+        print(f"Saved vocabulary to {path}")
+    def load_vocab(self, path: str):
+        """Load vocabulary from JSON file."""
+        with open(path, 'r') as f:
+            data = json.load(f)
+        self.special_tokens = data['special_tokens']
+        self.token_to_id = data['token_to_id']
+        self.id_to_token = {int(v): k for k, v in self.token_to_id.items()}
+        self.merges = [tuple(m) for m in data['merges']]
+        print(f"Loaded vocabulary with {len(self.token_to_id)} tokens")
+    @property
+    def vocab_size(self) -> int:
+        return len(self.token_to_id)
+# ============================================================================
+# Testing
+# ============================================================================
+if __name__ == '__main__':
+    # Test pre-tokenizer
+    print("="*80)
+    print("Testing WURCSPreTokenizer")
+    print("="*80)
+    pre_tok = WURCSPreTokenizer()
+    test_wurcs = [
+        "WURCS=2.0/2,2,1/[a2122h-1b_1-5][a2211m-1a_1-5]/1-2/a4-b1",
+        "WURCS=2.0/3,3,2/[a2122h-1b_1-5_2*NCC/3=O][a2112h-1a_1-5][a2211m-1a_1-5]/1-2-3/a4-b1_b3-c1",
+    ]
+    for wurcs in test_wurcs:
+        print(f"\nWURCS: {wurcs[:60]}...")
+        units = pre_tok.pre_tokenize(wurcs)
+        print(f"Units ({len(units)}):")
+        for u in units[:10]:
+            print(f"  {u['type']:10} | res={u['residue_id']:2} | depth={u['branch_depth']} | {u['text']}")
+        if len(units) > 10:
+            print(f"  ... and {len(units) - 10} more")

vocab/bpe_ambiguity_tokens.json ADDED Viewed

	@@ -0,0 +1,721 @@

+{
+  "ambiguous_tokens": {
+    "?": 32,
+    "?|": 90,
+    "a?|": 108,
+    "a?|b": 109,
+    "?|c": 110,
+    "a?|b?|c": 111,
+    "?|d": 112,
+    "a?|b?|c?|d": 113,
+    "?|e": 114,
+    "a?|b?|c?|d?|e": 115,
+    "?|f": 116,
+    "a?|b?|c?|d?|e?|f": 117,
+    "?-": 118,
+    "?|g": 119,
+    "a?|b?|c?|d?|e?|f?|g": 120,
+    "?|h": 122,
+    "?|i": 123,
+    "?|h?|i": 124,
+    "?|j": 125,
+    "?|h?|i?|j": 126,
+    "?|k": 128,
+    "?|h?|i?|j?|k": 129,
+    "?|l": 130,
+    "?|h?|i?|j?|k?|l": 131,
+    "?|m": 132,
+    "?|h?|i?|j?|k?|l?|m": 133,
+    "?|h?|i?|j?|k?|l?|m?|": 138,
+    "n?|": 141,
+    "n?|o": 142,
+    "?}": 143,
+    "n?|o?|": 146,
+    "n?|o?|p": 147,
+    "?}-": 149,
+    "?}-{": 150,
+    "n?|o?|p?|": 153,
+    "n?|o?|p?|q": 154,
+    "n?|o?|p?|q?|": 157,
+    "n?|o?|p?|q?|r": 158,
+    "n?|o?|p?|q?|r?|": 165,
+    "n?|o?|p?|q?|r?|s": 166,
+    "n?|o?|p?|q?|r?|s?|": 170,
+    "n?|o?|p?|q?|r?|s?|t": 171,
+    "?|u": 189,
+    "a?-": 197,
+    "c?-": 201,
+    "?|u?|": 209,
+    "?|u?|v": 210,
+    "b?-": 211,
+    "a?-b1": 213,
+    "d?-": 217,
+    "b?-c1": 221,
+    "c?-d1": 230,
+    "?|u?|v?|": 231,
+    "?|u?|v?|w": 232,
+    "1-?": 242,
+    "d?-e1": 244,
+    "e?-": 245,
+    "e?-f1": 262,
+    "?|u?|v?|w?|": 266,
+    "?|u?|v?|w?|x": 267,
+    "f?-": 273,
+    "?|u?|v?|w?|x?|": 288,
+    "?|u?|v?|w?|x?|y": 289,
+    "g?-": 298,
+    "n?|o?|p?|q?|r?|s?|t?": 304,
+    "i?-": 306,
+    "h?-": 308,
+    "?|u?}-{": 312,
+    "?|u?": 313,
+    "n?|o?|p?|q?|r?}-{": 314,
+    "n?|o?|p?|q?|r?": 315,
+    "f?-g1": 318,
+    "n?|o?}-{": 322,
+    "n?|o?": 323,
+    "?|u?|v?|w?|x?|y?|": 325,
+    "?|u?|v?|w?|x?|y?|z": 326,
+    "n?|o?|p?|q?|r?|s?}-{": 328,
+    "n?|o?|p?|q?|r?|s?": 329,
+    "n?|o?|p?}-{": 331,
+    "n?|o?|p?": 332,
+    "n?|o?|p?|q?}-{": 336,
+    "n?|o?|p?|q?": 337,
+    "g?-h1": 339,
+    "?|h?|i?|j?|k?|l?}-{": 342,
+    "?|h?|i?|j?|k?|l?": 343,
+    "n?}-{": 344,
+    "n?": 345,
+    "j?-": 346,
+    "h?-i1": 347,
+    "?|u?|v?}-{": 351,
+    "?|u?|v?": 352,
+    "k?-": 353,
+    "i?-j1": 355,
+    "?|h?|i?|j?|k?|l?|m?": 363,
+    "?|h?|i?|j?|k?}-{": 364,
+    "?|h?|i?|j?|k?": 365,
+    "?|u?|v?|w?|x?|y?|z?|": 369,
+    "?|h?|i?|j?}-{": 375,
+    "?|h?|i?|j?": 376,
+    "l?-": 377,
+    "A?|": 392,
+    "A?|B": 393,
+    "j?-k1": 401,
+    "m?-": 404,
+    "?|h?|i?}-{": 408,
+    "?|h?|i?": 409,
+    "k?-l1": 418,
+    "?|h?}-{": 420,
+    "?|h?": 421,
+    "A?|B?|": 424,
+    "A?|B?|C": 425,
+    "a?|b?|c?|d?|e?|f?|g?": 427,
+    "f?-g2": 431,
+    "a?|b?|c?|d?|e?|f?}-{": 437,
+    "a?|b?|c?|d?|e?|f?": 438,
+    "l?-m1": 442,
+    "?|u?|v?|w?}-{": 450,
+    "?|u?|v?|w?": 451,
+    "A?|B?|C?|": 464,
+    "A?|B?|C?|D": 465,
+    "a?|b?|c?|d?|e?}-{": 475,
+    "a?|b?|c?|d?|e?": 476,
+    "n?-": 499,
+    "a?|b?|c?|d?}-{": 502,
+    "a?|b?|c?|d?": 503,
+    "m?-n1": 518,
+    "A?|B?|C?|D?|": 521,
+    "A?|B?|C?|D?|E": 522,
+    "o?-": 534,
+    "d?-h1": 536,
+    "A?|B?|C?|D?|E?|": 542,
+    "A?|B?|C?|D?|E?|F": 543,
+    "c?-i1": 544,
+    "c?-h1": 549,
+    "A?|B?|C?|D?|E?|F?|": 550,
+    "A?|B?|C?|D?|E?|F?|G": 551,
+    "?|u?|v?|w?|x?}-{": 563,
+    "?|u?|v?|w?|x?": 564,
+    "a?|b?|c?}-{": 571,
+    "a?|b?|c?}-{a?|b?|c": 572,
+    "a?|b?|c?}-{a?|b?|c?": 573,
+    "?|H": 581,
+    "2-?": 592,
+    "?|H?|": 598,
+    "?|H?|I": 599,
+    "?}*OC": 600,
+    "c?-k1": 607,
+    "c?-g1": 609,
+    "?|H?|I?|": 615,
+    "?|H?|I?|J": 616,
+    "n?-o1": 617,
+    "d?-g1": 629,
+    "o?-p1": 634,
+    "p?-": 646,
+    "?|u?|v?|w?|x?|y?}-{": 653,
+    "?|u?|v?|w?|x?|y?": 654,
+    "b?-c2": 656,
+    "d?-i1": 658,
+    "c?-j1": 691,
+    "?}*OSO": 696,
+    "e?-h1": 701,
+    "q?-": 713,
+    "c?-f1": 720,
+    "i?-j2": 728,
+    "?|h?|i?}": 742,
+    "h?-i2": 747,
+    "g?-h2": 753,
+    "c?-l1": 756,
+    "j?-k2": 758,
+    "?|h?}": 759,
+    "c?-e1": 760,
+    "?|H?|I?|J?|": 761,
+    "?|H?|I?|J?|K": 762,
+    "a?|b?|c?|d?|e?|f?}": 772,
+    "b?-e1": 774,
+    "b?-f1": 791,
+    "d?-f1": 794,
+    "p?-q1": 796,
+    "a?|b?|c?|d?|e?}": 798,
+    "a?-d1": 800,
+    "m?-n2": 803,
+    "e?-g1": 809,
+    "?|h?|i?|j?}": 812,
+    "r?-": 817,
+    "a?-c1": 818,
+    "?|u?|v?|w?|x?|y?|z?": 822,
+    "a?-e1": 826,
+    "d?-j1": 833,
+    "b?-g1": 834,
+    "q?-r1": 847,
+    "d?-e2": 854,
+    "c?-m1": 860,
+    "a?-f1": 875,
+    "b?-d1": 887,
+    "?|H?|I?|J?|K?|": 892,
+    "?|H?|I?|J?|K?|L": 893,
+    "?|H?|I?|J?|K?|L?|": 894,
+    "?|H?|I?|J?|K?|L?|M": 895,
+    "?|H?|I?|J?|K?|L?|M?|": 896,
+    "a?-l1": 920,
+    "?*OSO/3=O/3=O": 923,
+    "k?-l2": 940,
+    "k?-o1": 942,
+    "N?|": 965,
+    "N?|O": 966,
+    "N?|O?|": 967,
+    "N?|O?|P": 968,
+    "N?|O?|P?|": 969,
+    "N?|O?|P?|Q": 970,
+    "N?|O?|P?|Q?|": 971,
+    "N?|O?|P?|Q?|R": 972,
+    "N?|O?|P?|Q?|R?|": 973,
+    "N?|O?|P?|Q?|R?|S": 974,
+    "N?|O?|P?|Q?|R?|S?|": 975,
+    "N?|O?|P?|Q?|R?|S?|T": 976,
+    "?|U": 977,
+    "?|U?|": 978,
+    "?|U?|V": 979,
+    "c?-d2": 983,
+    "r?-s1": 988,
+    "a?|b?}-{": 995,
+    "a?|b?}-{a?|b": 996,
+    "a?|b?}-{a?|b?": 997,
+    "e?-f2": 1001,
+    "g?-i1": 1006,
+    "i?-l1": 1010,
+    "s?-": 1011,
+    "?|h?|i?|j?|k?}": 1017,
+    "b?-h1": 1034,
+    "a?-j1": 1038,
+    "n?-o2": 1046,
+    "a?-b2": 1069,
+    "e?-i1": 1095,
+    "h?-j1": 1102,
+    "a?-k1": 1108,
+    "i?-k1": 1115,
+    "a?-g1": 1116,
+    "?}*OPO": 1122,
+    "d?-k1": 1129,
+    "a?-m1": 1151,
+    "a?-i1": 1159,
+    "A?}-{": 1174,
+    "A?": 1175,
+    "?}*OCC": 1177,
+    "l?-m2": 1179,
+    "A?|B?}-{": 1180,
+    "A?|B?": 1181,
+    "f?-h1": 1183,
+    "a?-n1": 1189,
+    "p?-q2": 1192,
+    "c?-n1": 1197,
+    "?|U?|V?|": 1202,
+    "?|U?|V?|W": 1203,
+    "?|U?|V?|W?|": 1204,
+    "?|U?|V?|W?|X": 1205,
+    "?|U?|V?|W?|X?|": 1206,
+    "?|U?|V?|W?|X?|Y": 1207,
+    "?|a": 1208,
+    "s?-t1": 1223,
+    "?|h?|i?|j?|k?|l?|m?}": 1228,
+    "g?-j1": 1234,
+    "A?|B?|C?|D?}-{": 1242,
+    "A?|B?|C?|D?": 1243,
+    "a?-h1": 1253,
+    "?|H?|I?|J?}-{": 1257,
+    "?|H?|I?|J?": 1258,
+    "o?-p2": 1261,
+    "b?-i1": 1273,
+    "?|h?|i?|j?|k?|l?}": 1309,
+    "j?-m1": 1317,
+    "c?-o1": 1318,
+    "a?-o1": 1330,
+    "a?|b?|c?}*OC": 1331,
+    "b?-j1": 1357,
+    "a?-r1": 1361,
+    "n?}": 1363,
+    "A?|B?|C?}-{": 1371,
+    "A?|B?|C?": 1372,
+    "m?-p1": 1375,
+    "l?-p1": 1383,
+    "a?-p1": 1444,
+    "k?-n1": 1446,
+    "j?-l1": 1470,
+    "?|U?|V?|W?|X?|Y?|": 1471,
+    "?|U?|V?|W?|X?|Y?|Z": 1472,
+    "?|aa?|": 1473,
+    "?|aa?|a": 1474,
+    "?|aa?|ab": 1475,
+    "?*OPO/3O/3=O": 1476,
+    "l?-q1": 1489,
+    "l?-n1": 1499,
+    "a?-s1": 1517,
+    "k?-m1": 1524,
+    "a?-q1": 1546,
+    "c?-q1": 1547,
+    "t?-": 1551,
+    "a?|b?|c?|d?}*OC": 1565,
+    "f?-i1": 1590,
+    "c?-p1": 1591,
+    "n?-q1": 1593,
+    "?|i?}": 1611,
+    "a?|b?|c?|d?|e?}*OC": 1612,
+    "m?-q1": 1617,
+    "q?-r2": 1623,
+    "l?-o1": 1624,
+    "m?-r1": 1628,
+    "a?-t1": 1630,
+    "a?|b?|c?|d?}*OSO": 1649,
+    "c?-r1": 1675,
+    "1-d?|i?}": 1683,
+    "j?-n1": 1691,
+    "u?-": 1694,
+    "a?|b?|c?|d?|e?}*OSO": 1718,
+    "?*OCC/3=O": 1723,
+    "?%": 1752,
+    "?*OP^XOCCN/3O/3=O": 1770,
+    "t?-u1": 1772,
+    "?*": 1774,
+    "c?-s1": 1775,
+    "a?-u1": 1793,
+    "f?-h2": 1808,
+    "e?-j1": 1811,
+    "c?-t1": 1818,
+    "f1-a?|b?|c?|d?|e?}": 1822,
+    "u?-v1": 1835,
+    "h?-k1": 1841,
+    "?|H?|I?|J?|K?}-{": 1846,
+    "?|H?|I?|J?|K?": 1847,
+    "n?|o?}": 1851,
+    "1-d?|h?}": 1852,
+    "q?-s1": 1872,
+    "%?%": 1880,
+    "b?-g2": 1881,
+    "r?-s2": 1882,
+    "d?-l1": 1898,
+    "v?-": 1917,
+    "b?-k1": 1927,
+    "?|aa?|ab?|a": 1942,
+    "?|aa?|ab?|ac": 1943,
+    "?|aa?|ab?|ac?|": 1944,
+    "?|aa?|ab?|ac?|ad": 1945,
+    "a?|b?}*OC": 1949,
+    "?*OC": 1952,
+    "e?-k1": 1955,
+    "a?-d2": 1999,
+    "s?-t2": 2013,
+    "a?-f2": 2027,
+    "o?-q1": 2030,
+    "?}*OP^XOCCN": 2040,
+    "a?|b?|c?}*OCC": 2047,
+    "m?-o1": 2048,
+    "c?-f2": 2058,
+    "A?|B?|C?|D?|E?|F?|G?": 2060,
+    "a?|b?|c?}*OSO": 2071,
+    "?|U?|V?}-{": 2079,
+    "?|U?|V?": 2080,
+    "c?-u1": 2087
+  },
+  "ambiguous_ids": [
+    32,
+    90,
+    108,
+    109,
+    110,
+    111,
+    112,
+    113,
+    114,
+    115,
+    116,
+    117,
+    118,
+    119,
+    120,
+    122,
+    123,
+    124,
+    125,
+    126,
+    128,
+    129,
+    130,
+    131,
+    132,
+    133,
+    138,
+    141,
+    142,
+    143,
+    146,
+    147,
+    149,
+    150,
+    153,
+    154,
+    157,
+    158,
+    165,
+    166,
+    170,
+    171,
+    189,
+    197,
+    201,
+    209,
+    210,
+    211,
+    213,
+    217,
+    221,
+    230,
+    231,
+    232,
+    242,
+    244,
+    245,
+    262,
+    266,
+    267,
+    273,
+    288,
+    289,
+    298,
+    304,
+    306,
+    308,
+    312,
+    313,
+    314,
+    315,
+    318,
+    322,
+    323,
+    325,
+    326,
+    328,
+    329,
+    331,
+    332,
+    336,
+    337,
+    339,
+    342,
+    343,
+    344,
+    345,
+    346,
+    347,
+    351,
+    352,
+    353,
+    355,
+    363,
+    364,
+    365,
+    369,
+    375,
+    376,
+    377,
+    392,
+    393,
+    401,
+    404,
+    408,
+    409,
+    418,
+    420,
+    421,
+    424,
+    425,
+    427,
+    431,
+    437,
+    438,
+    442,
+    450,
+    451,
+    464,
+    465,
+    475,
+    476,
+    499,
+    502,
+    503,
+    518,
+    521,
+    522,
+    534,
+    536,
+    542,
+    543,
+    544,
+    549,
+    550,
+    551,
+    563,
+    564,
+    571,
+    572,
+    573,
+    581,
+    592,
+    598,
+    599,
+    600,
+    607,
+    609,
+    615,
+    616,
+    617,
+    629,
+    634,
+    646,
+    653,
+    654,
+    656,
+    658,
+    691,
+    696,
+    701,
+    713,
+    720,
+    728,
+    742,
+    747,
+    753,
+    756,
+    758,
+    759,
+    760,
+    761,
+    762,
+    772,
+    774,
+    791,
+    794,
+    796,
+    798,
+    800,
+    803,
+    809,
+    812,
+    817,
+    818,
+    822,
+    826,
+    833,
+    834,
+    847,
+    854,
+    860,
+    875,
+    887,
+    892,
+    893,
+    894,
+    895,
+    896,
+    920,
+    923,
+    940,
+    942,
+    965,
+    966,
+    967,
+    968,
+    969,
+    970,
+    971,
+    972,
+    973,
+    974,
+    975,
+    976,
+    977,
+    978,
+    979,
+    983,
+    988,
+    995,
+    996,
+    997,
+    1001,
+    1006,
+    1010,
+    1011,
+    1017,
+    1034,
+    1038,
+    1046,
+    1069,
+    1095,
+    1102,
+    1108,
+    1115,
+    1116,
+    1122,
+    1129,
+    1151,
+    1159,
+    1174,
+    1175,
+    1177,
+    1179,
+    1180,
+    1181,
+    1183,
+    1189,
+    1192,
+    1197,
+    1202,
+    1203,
+    1204,
+    1205,
+    1206,
+    1207,
+    1208,
+    1223,
+    1228,
+    1234,
+    1242,
+    1243,
+    1253,
+    1257,
+    1258,
+    1261,
+    1273,
+    1309,
+    1317,
+    1318,
+    1330,
+    1331,
+    1357,
+    1361,
+    1363,
+    1371,
+    1372,
+    1375,
+    1383,
+    1444,
+    1446,
+    1470,
+    1471,
+    1472,
+    1473,
+    1474,
+    1475,
+    1476,
+    1489,
+    1499,
+    1517,
+    1524,
+    1546,
+    1547,
+    1551,
+    1565,
+    1590,
+    1591,
+    1593,
+    1611,
+    1612,
+    1617,
+    1623,
+    1624,
+    1628,
+    1630,
+    1649,
+    1675,
+    1683,
+    1691,
+    1694,
+    1718,
+    1723,
+    1752,
+    1770,
+    1772,
+    1774,
+    1775,
+    1793,
+    1808,
+    1811,
+    1818,
+    1822,
+    1835,
+    1841,
+    1846,
+    1847,
+    1851,
+    1852,
+    1872,
+    1880,
+    1881,
+    1882,
+    1898,
+    1917,
+    1927,
+    1942,
+    1943,
+    1944,
+    1945,
+    1949,
+    1952,
+    1955,
+    1999,
+    2013,
+    2027,
+    2030,
+    2040,
+    2047,
+    2048,
+    2058,
+    2060,
+    2071,
+    2079,
+    2080,
+    2087
+  ],
+  "source_vocab": "data/bpe_vocabulary_clean.json"
+}

vocab/bpe_vocabulary.json ADDED Viewed

The diff for this file is too large to render. See raw diff