| """ |
| BERTose model |
| |
| Core glycan representation model with three modalities: |
| - Sequence (WURCS atomic tokenization) |
| - MS (mass spectrometry peaks, RT, intensity) |
| - 3D structure (VQ-VAE discrete tokens, 4 per residue) |
| |
| Each modality has its own encoder, with cross-attention for sequence-structure alignment. |
| """ |
|
|
| import torch |
| import torch.nn as nn |
| from typing import Dict, Optional, Tuple |
| import math |
|
|
| try: |
| from .bertose_layers import GlycanBERTConfig, GlycanBERTEmbeddings, GlycanBERTLayer |
| except ImportError: |
| from bertose_layers import GlycanBERTConfig, GlycanBERTEmbeddings, GlycanBERTLayer |
|
|
|
|
| class ConvGlycanBERTEmbeddings(nn.Module): |
| """ |
| Improved Convolutional front-end that mixes local WURCS context before the Transformer. |
| |
| Key improvements over original: |
| 1. Position embeddings added BEFORE convolution (provides spatial context to conv) |
| 2. Residual connection (conv enriches embeddings rather than replacing them) |
| 3. Multi-scale convolutions (kernel sizes 3, 5, 7) for better receptive field |
| 4. Proper layer normalization on the residual path |
| """ |
|
|
| def __init__(self, config): |
| super().__init__() |
| self.token_embeddings = nn.Embedding( |
| config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id |
| ) |
| self.position_embeddings = nn.Embedding( |
| config.max_position_embeddings, config.hidden_size |
| ) |
| |
| |
| # Branch depth embeddings: how deeply nested each token sits in the glycan tree |
| max_branch_depth = getattr(config, "max_branch_depth", 8) |
| self.branch_embeddings = nn.Embedding(max_branch_depth, config.hidden_size) |
| |
| |
| |
| # Linkage type embeddings for glycosidic bond classes |
| num_linkage_types = getattr(config, "num_linkage_types", 9) |
| self.linkage_embeddings = nn.Embedding(num_linkage_types, config.hidden_size) |
| |
| |
| kernel_size = getattr(config, "cnn_kernel_size", 3) |
| |
| # Three parallel convolutions with kernel sizes k, k+2, k+4 (3/5/7 by default), |
| # each padded so the sequence length is preserved |
| channels_per_scale = config.hidden_size // 3 |
| self.conv_layers = nn.ModuleList([ |
| nn.Conv1d( |
| in_channels=config.hidden_size, |
| out_channels=channels_per_scale, |
| kernel_size=kernel_size + 2 * i, |
| padding=(kernel_size + 2 * i) // 2, |
| ) |
| for i in range(3) |
| ]) |
| self.conv_activation = nn.GELU() |
| # Project the concatenated multi-scale channels back to hidden_size |
| self.conv_proj = nn.Linear(channels_per_scale * 3, config.hidden_size) |
| |
| self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) |
| self.conv_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) |
| self.dropout = nn.Dropout(config.hidden_dropout_prob) |
| self.register_buffer( |
| "position_ids", |
| torch.arange(config.max_position_embeddings).expand((1, -1)), |
| ) |
|
|
| self.hidden_size = config.hidden_size |
|
|
| def forward(self, input_ids, branch_depths=None, linkage_types=None): |
| seq_len = input_ids.shape[1] |
| |
| |
| # Token + position embeddings (positions added BEFORE the conv; see class docstring) |
| x = self.token_embeddings(input_ids) |
| position_ids = self.position_ids[:, :seq_len] |
| x = x + self.position_embeddings(position_ids) |
| |
| |
| if branch_depths is not None: |
| |
| branch_depths = branch_depths.clamp(0, self.branch_embeddings.num_embeddings - 1) |
| x = x + self.branch_embeddings(branch_depths) |
| |
| |
| if linkage_types is not None: |
| linkage_types = linkage_types.clamp(0, self.linkage_embeddings.num_embeddings - 1) |
| x = x + self.linkage_embeddings(linkage_types) |
| |
| x = self.LayerNorm(x) |
| |
| |
| |
| # Conv1d expects (batch, channels, length) |
| conv_in = x.permute(0, 2, 1) |
| |
| |
| conv_outputs = [] |
| for conv in self.conv_layers: |
| conv_out = self.conv_activation(conv(conv_in)) |
| conv_outputs.append(conv_out) |
| |
| |
| # Concatenate the per-scale channels and project back to hidden_size |
| conv_out = torch.cat(conv_outputs, dim=1) |
| conv_out = conv_out.permute(0, 2, 1) |
| conv_out = self.conv_proj(conv_out) |
| |
| |
| # Residual: the conv output enriches rather than replaces the embeddings |
| x = self.conv_norm(x + self.dropout(conv_out)) |
| |
| return x |
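| |
| # A shape walkthrough of the conv front-end (illustrative, assuming hidden_size=768): |
| #   x: (B, L, 768) --permute--> (B, 768, L) |
| #   each conv (kernels 3/5/7): (B, 768, L) -> (B, 256, L) |
| #   concat over channels: (B, 768, L) --permute--> (B, L, 768) |
| #   conv_proj + residual + LayerNorm: (B, L, 768) |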
|
|
|
|
| def create_residue_level_mask( |
| seq_residue_ids: torch.Tensor, |
| struct_residue_ids: torch.Tensor |
| ) -> torch.Tensor: |
| """ |
| Create residue-level attention mask for cross-attention. |
| |
| Maps WURCS tokens to VQ-VAE structural tokens based on residue IDs. |
| A WURCS token with residue_id=0 can only attend to VQ-VAE tokens with residue_id=0. |
| |
| Args: |
| seq_residue_ids: Residue IDs for sequence tokens (batch, N_seq) |
| struct_residue_ids: Residue IDs for structural tokens (batch, N_struct) |
| |
| Returns: |
| Boolean mask (batch, N_seq, N_struct) where True = can attend |
| """ |
| |
| |
| |
| # Pairwise comparison: (batch, N_seq, 1) vs (batch, 1, N_struct) |
| mask = seq_residue_ids.unsqueeze(2) == struct_residue_ids.unsqueeze(1) |
| |
| # Special tokens (residue_id = -1) must not attend to any structural token |
| mask &= (seq_residue_ids.unsqueeze(2) >= 0) |
| |
| return mask |
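| |
| # A worked example (illustrative values): |
| #   seq_rids    = torch.tensor([[0, 0, 1, -1]])   # 4 sequence tokens |
| #   struct_rids = torch.tensor([[0, 1, 1]])       # 3 structural tokens |
| #   create_residue_level_mask(seq_rids, struct_rids)[0] == |
| #       [[True, False, False], |
| #        [True, False, False], |
| #        [False, True, True], |
| #        [False, False, False]]   # residue_id=-1 attends to nothing |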
|
|
|
|
| class MultimodalGlycanBERTConfig: |
| """Configuration for the BERTose model.""" |
| |
| def __init__( |
| self, |
| |
| # Sequence encoder (WURCS tokens) |
| seq_vocab_size: int = 166, |
| seq_hidden_size: int = 768, |
| seq_num_layers: int = 12, |
| seq_num_heads: int = 12, |
| seq_max_length: int = 512, |
| |
| |
| # MS encoder (spectral peak tokens) |
| ms_vocab_size: int = 242, |
| ms_hidden_size: int = 384, |
| ms_num_layers: int = 6, |
| ms_num_heads: int = 6, |
| ms_max_length: int = 150, |
| |
| |
| # 3D structure encoder (VQ-VAE tokens) |
| struct_vocab_size: int = 1024, |
| struct_hidden_size: int = 512, |
| struct_num_layers: int = 8, |
| struct_num_heads: int = 8, |
| struct_max_length: int = 200, |
| use_3d: bool = True, |
| |
| |
| # Cross-attention (sequence -> structure) |
| use_cross_attention: bool = True, |
| cross_attn_num_heads: int = 8, |
| |
| |
| # Fusion head |
| fusion_hidden_size: int = 768, |
| fusion_num_layers: int = 2, |
| |
| |
| # Regularization and initialization |
| hidden_dropout_prob: float = 0.1, |
| attention_probs_dropout_prob: float = 0.1, |
| layer_norm_eps: float = 1e-12, |
| initializer_range: float = 0.02, |
| |
| |
| # Convolutional front-end |
| use_cnn_frontend: bool = True, |
| cnn_kernel_size: int = 3, |
| |
| |
| # Per-modality MLM loss weights |
| seq_loss_weight: float = 0.60, |
| ms_loss_weight: float = 0.15, |
| struct_loss_weight: float = 0.25, |
| |
| |
| # Special tokens |
| pad_token_id: int = 0, |
| mask_token_id: int = 1, |
| ): |
| |
| self.seq_vocab_size = seq_vocab_size |
| self.seq_hidden_size = seq_hidden_size |
| self.seq_num_layers = seq_num_layers |
| self.seq_num_heads = seq_num_heads |
| self.seq_max_length = seq_max_length |
| |
| |
| self.ms_vocab_size = ms_vocab_size |
| # MS token IDs are offset to live after the sequence vocab: |
| # [seq_vocab_size, seq_vocab_size + ms_vocab_size) |
| self.ms_vocab_offset = seq_vocab_size |
| self.ms_total_vocab_size = seq_vocab_size + ms_vocab_size |
| self.ms_hidden_size = ms_hidden_size |
| self.ms_num_layers = ms_num_layers |
| self.ms_num_heads = ms_num_heads |
| self.ms_max_length = ms_max_length |
| |
| |
| self.struct_vocab_size = struct_vocab_size |
| self.struct_hidden_size = struct_hidden_size |
| self.struct_num_layers = struct_num_layers |
| self.struct_num_heads = struct_num_heads |
| self.struct_max_length = struct_max_length |
| self.use_3d = use_3d |
| |
| |
| self.use_cross_attention = use_cross_attention |
| self.cross_attn_num_heads = cross_attn_num_heads |
| |
| |
| self.fusion_hidden_size = fusion_hidden_size |
| self.fusion_num_layers = fusion_num_layers |
| |
| |
| self.hidden_dropout_prob = hidden_dropout_prob |
| self.attention_probs_dropout_prob = attention_probs_dropout_prob |
| self.layer_norm_eps = layer_norm_eps |
| self.initializer_range = initializer_range |
|
|
| |
| self.use_cnn_frontend = use_cnn_frontend |
| self.cnn_kernel_size = cnn_kernel_size |
| |
| |
| self.seq_loss_weight = seq_loss_weight |
| self.ms_loss_weight = ms_loss_weight |
| self.struct_loss_weight = struct_loss_weight |
| # Auxiliary pairwise-distance loss weight (fixed; not exposed as an argument) |
| self.dist_loss_weight = 0.25 |
| |
| |
| self.pad_token_id = pad_token_id |
| self.mask_token_id = mask_token_id |
| |
| def to_seq_config(self) -> GlycanBERTConfig: |
| """Convert to sequence-only config.""" |
| return GlycanBERTConfig( |
| vocab_size=self.seq_vocab_size, |
| hidden_size=self.seq_hidden_size, |
| num_hidden_layers=self.seq_num_layers, |
| num_attention_heads=self.seq_num_heads, |
| intermediate_size=self.seq_hidden_size * 4, |
| hidden_dropout_prob=self.hidden_dropout_prob, |
| attention_probs_dropout_prob=self.attention_probs_dropout_prob, |
| max_position_embeddings=self.seq_max_length, |
| layer_norm_eps=self.layer_norm_eps, |
| pad_token_id=self.pad_token_id, |
| mask_token_id=self.mask_token_id, |
| initializer_range=self.initializer_range, |
| ) |
| |
| def to_ms_config(self) -> GlycanBERTConfig: |
| """Convert to MS-only config.""" |
| return GlycanBERTConfig( |
| vocab_size=self.ms_total_vocab_size, |
| hidden_size=self.ms_hidden_size, |
| num_hidden_layers=self.ms_num_layers, |
| num_attention_heads=self.ms_num_heads, |
| intermediate_size=self.ms_hidden_size * 4, |
| hidden_dropout_prob=self.hidden_dropout_prob, |
| attention_probs_dropout_prob=self.attention_probs_dropout_prob, |
| max_position_embeddings=self.ms_max_length, |
| layer_norm_eps=self.layer_norm_eps, |
| pad_token_id=self.pad_token_id, |
| mask_token_id=self.mask_token_id, |
| initializer_range=self.initializer_range, |
| ) |
| |
| def to_struct_config(self) -> GlycanBERTConfig: |
| """Convert to structure-only config.""" |
| return GlycanBERTConfig( |
| vocab_size=self.struct_vocab_size, |
| hidden_size=self.struct_hidden_size, |
| num_hidden_layers=self.struct_num_layers, |
| num_attention_heads=self.struct_num_heads, |
| intermediate_size=self.struct_hidden_size * 4, |
| hidden_dropout_prob=self.hidden_dropout_prob, |
| attention_probs_dropout_prob=self.attention_probs_dropout_prob, |
| max_position_embeddings=self.struct_max_length, |
| layer_norm_eps=self.layer_norm_eps, |
| pad_token_id=self.pad_token_id, |
| mask_token_id=self.mask_token_id, |
| initializer_range=self.initializer_range, |
| ) |
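| |
| # Vocab layout implied by the defaults (illustrative): |
| #   sequence tokens: IDs 0..165 (seq_vocab_size=166) |
| #   MS tokens:       IDs 166..407 (shifted by ms_vocab_offset=166) |
| #   struct tokens:   IDs 0..1023 in a separate embedding table (struct_vocab_size=1024) |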
|
|
|
|
| |
| |
| |
|
|
| class MonosaccharidePooling(nn.Module): |
| """ |
| Pool token representations to monosaccharide level, then aggregate. |
| |
| This bridges the gap between token-level BERT and monosaccharide-level CNNs/GNNs. |
| Uses monosaccharide_indices from the data to know where each residue starts. |
| """ |
| |
| def __init__(self, hidden_size: int, num_attention_heads: int = 8, dropout: float = 0.1): |
| super().__init__() |
| self.hidden_size = hidden_size |
| |
| |
| self.mono_attention = nn.MultiheadAttention( |
| embed_dim=hidden_size, |
| num_heads=num_attention_heads, |
| dropout=dropout, |
| batch_first=True |
| ) |
| self.mono_norm = nn.LayerNorm(hidden_size) |
| |
| |
| self.glycan_query = nn.Parameter(torch.randn(1, 1, hidden_size) * 0.02) |
| self.glycan_attention = nn.MultiheadAttention( |
| embed_dim=hidden_size, |
| num_heads=num_attention_heads, |
| dropout=dropout, |
| batch_first=True |
| ) |
| self.glycan_norm = nn.LayerNorm(hidden_size) |
| |
| def forward( |
| self, |
| hidden_states: torch.Tensor, |
| residue_ids: torch.Tensor, |
| attention_mask: torch.Tensor = None, |
| ) -> torch.Tensor: |
| """ |
| Pool tokens to monosaccharide level, then to glycan level. |
| |
| Returns: |
| Glycan representation: (batch, hidden_size) |
| """ |
| batch_size = hidden_states.size(0) |
| device = hidden_states.device |
| |
| |
| # Fixed upper bound on monosaccharides per glycan |
| max_residues = 50 |
| |
| |
| mono_reps = torch.zeros(batch_size, max_residues, self.hidden_size, device=device) |
| mono_mask = torch.zeros(batch_size, max_residues, dtype=torch.bool, device=device) |
| |
| # Mean-pool token vectors per residue (simple Python loop over the batch) |
| for b in range(batch_size): |
| unique_residues = torch.unique(residue_ids[b][residue_ids[b] >= 0]) |
| for i, rid in enumerate(unique_residues): |
| if i >= max_residues: |
| break |
| token_mask = residue_ids[b] == rid |
| if attention_mask is not None: |
| token_mask = token_mask & (attention_mask[b] > 0) |
| if token_mask.sum() > 0: |
| mono_reps[b, i] = hidden_states[b][token_mask].mean(dim=0) |
| mono_mask[b, i] = True |
| |
| |
| |
| # nn.MultiheadAttention expects True = ignore this key position |
| # (assumes every sample has at least one valid residue) |
| key_padding_mask = ~mono_mask |
| |
| mono_out, _ = self.mono_attention( |
| mono_reps, mono_reps, mono_reps, |
| key_padding_mask=key_padding_mask |
| ) |
| mono_out = self.mono_norm(mono_reps + mono_out) |
| |
| |
| glycan_query = self.glycan_query.expand(batch_size, -1, -1) |
| glycan_out, _ = self.glycan_attention( |
| glycan_query, mono_out, mono_out, |
| key_padding_mask=key_padding_mask |
| ) |
| glycan_out = self.glycan_norm(glycan_query + glycan_out) |
| |
| return glycan_out.squeeze(1) |
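| |
| # Usage sketch (hypothetical shapes; note this module is defined here but not |
| # wired into MultimodalGlycanBERT below): |
| #   pool = MonosaccharidePooling(hidden_size=768) |
| #   hidden = torch.randn(2, 128, 768)        # token-level encoder output |
| #   rids = torch.randint(0, 10, (2, 128))    # residue id per token |
| #   glycan_vec = pool(hidden, rids)          # -> (2, 768) |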
|
|
|
|
| |
| |
| |
|
|
| |
| MONOSACCHARIDE_VOCAB = { |
| '[PAD_MONO]': 0, '[UNK_MONO]': 1, |
| 'Glc': 2, 'GlcNAc': 3, 'GlcA': 4, 'GlcN': 5, |
| 'Gal': 6, 'GalNAc': 7, 'GalA': 8, 'GalN': 9, |
| 'Man': 10, 'ManNAc': 11, 'ManA': 12, 'ManN': 13, |
| 'Fuc': 14, 'Rha': 15, 'Xyl': 16, 'Ara': 17, |
| 'Neu5Ac': 18, 'Neu5Gc': 19, 'Kdn': 20, 'Sia': 21, |
| 'GalNAcA': 22, 'GlcNAcA': 23, 'IdoA': 24, 'GulA': 25, |
| 'Rib': 26, 'Lyx': 27, 'All': 28, 'Alt': 29, |
| 'Tal': 30, 'Ido': 31, 'Qui': 32, 'Oli': 33, |
| 'Tyv': 34, 'Abe': 35, 'Par': 36, 'Dig': 37, |
| 'Col': 38, 'Dha': 39, 'Kdo': 40, 'Hep': 41, |
| 'NeuroGc': 42, 'Muramic': 43, 'LDManHep': 44, 'DDManHep': 45, |
| 'Bac': 46, 'Pse': 47, 'Leg': 48, 'Aci': 49, |
| '6dTal': 50, 'Fru': 51, 'Tag': 52, 'Sor': 53, |
| 'Psi': 54, 'Sed': 55, 'MurNAc': 56, 'MurNGc': 57, |
| 'Api': 58, 'Erwiniose': 59, 'Yer': 60, 'Thre': 61, |
| |
| } |
|
|
|
|
| class ResidueTypeEmbeddings(nn.Module): |
| """ |
| Learnable embeddings for monosaccharide types. |
| |
| Instead of the model having to learn that 'a1221m' = Fucose from character patterns, |
| we explicitly add a Fucose embedding to all tokens belonging to that residue. |
| """ |
| |
| def __init__(self, hidden_size: int, num_mono_types: int = 70): |
| super().__init__() |
| self.mono_embeddings = nn.Embedding(num_mono_types, hidden_size) |
| self.mono_vocab = MONOSACCHARIDE_VOCAB |
| self.hidden_size = hidden_size |
| |
| def forward( |
| self, |
| token_embeddings: torch.Tensor, |
| residue_ids: torch.Tensor, |
| mono_type_ids: torch.Tensor = None, |
| ) -> torch.Tensor: |
| """ |
| Add residue type embeddings to token embeddings. |
| |
| Args: |
| token_embeddings: Base token embeddings |
| residue_ids: Which residue each token belongs to (-1 for special tokens) |
| mono_type_ids: Monosaccharide type ID for each residue position |
| |
| Returns: |
| Enhanced embeddings with residue type information |
| """ |
| if mono_type_ids is None: |
| return token_embeddings |
| |
| batch_size, seq_len, _ = token_embeddings.shape |
| enhanced = token_embeddings.clone() |
| |
| |
| # Per-token lookup via a double loop; could be vectorized with a gather |
| for b in range(batch_size): |
| for pos in range(seq_len): |
| rid = residue_ids[b, pos].item() |
| if rid >= 0 and rid < mono_type_ids.size(1): |
| mono_id = mono_type_ids[b, rid] |
| enhanced[b, pos] = enhanced[b, pos] + self.mono_embeddings(mono_id) |
| |
| return enhanced |
| |
| @staticmethod |
| def get_mono_type_id(mono_name: str) -> int: |
| """Convert monosaccharide name to type ID.""" |
| return MONOSACCHARIDE_VOCAB.get(mono_name, MONOSACCHARIDE_VOCAB['[UNK_MONO]']) |
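| |
| # Example lookups (values from MONOSACCHARIDE_VOCAB above): |
| #   ResidueTypeEmbeddings.get_mono_type_id('Fuc')        # -> 14 |
| #   ResidueTypeEmbeddings.get_mono_type_id('NotASugar')  # -> 1 ([UNK_MONO]) |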
|
|
|
|
| |
| |
| |
|
|
| class RelativePositionBias(nn.Module): |
| """ |
| Compute relative position bias for attention based on residue IDs. |
| |
| Tokens in the same residue get distance 0. |
| Tokens in adjacent residues get distance ±1. |
| This helps the model understand glycan tree structure. |
| """ |
| |
| def __init__(self, num_heads: int, max_distance: int = 10): |
| super().__init__() |
| self.num_heads = num_heads |
| self.max_distance = max_distance |
| |
| |
| num_distances = 2 * max_distance + 1 |
| self.relative_bias = nn.Embedding(num_distances, num_heads) |
| |
| def forward(self, residue_ids: torch.Tensor) -> torch.Tensor: |
| """ |
| Compute relative position bias. |
| |
| Args: |
| residue_ids: (batch, seq_len) |
| |
| Returns: |
| Bias to add to attention scores: (batch, num_heads, seq_len, seq_len) |
| """ |
| |
| |
| # Signed residue distance: d[i, j] = residue_ids[i] - residue_ids[j] |
| distance = residue_ids.unsqueeze(2) - residue_ids.unsqueeze(1) |
| |
| |
| # Clamp to [-max_distance, max_distance], then shift to a non-negative index |
| distance_clamped = distance.clamp(-self.max_distance, self.max_distance) |
| distance_idx = distance_clamped + self.max_distance |
| |
| |
| # (batch, seq_len, seq_len, num_heads) |
| bias = self.relative_bias(distance_idx) |
| |
| # -> (batch, num_heads, seq_len, seq_len), ready to add to attention scores |
| bias = bias.permute(0, 3, 1, 2) |
| |
| return bias |
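| |
| # Worked example with max_distance=10 (illustrative): |
| #   residue_ids = [0, 0, 1, 3] |
| #   distance[0, 3] = 0 - 3 = -3  -> index -3 + 10 = 7 |
| #   distance[3, 0] = 3 - 0 =  3  -> index 13; same-residue pairs map to index 10 |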
|
|
|
|
| class CrossAttentionLayer(nn.Module): |
| """ |
| Cross-attention layer for sequence-structure alignment. |
| |
| Allows sequence tokens to attend to structural atoms using attention masks. |
| """ |
| |
| def __init__(self, config: MultimodalGlycanBERTConfig): |
| super().__init__() |
| self.num_heads = config.cross_attn_num_heads |
| self.hidden_size = config.seq_hidden_size |
| self.head_dim = self.hidden_size // self.num_heads |
| |
| assert self.hidden_size % self.num_heads == 0, "hidden_size must be divisible by num_heads" |
| |
| |
| self.query = nn.Linear(config.seq_hidden_size, self.hidden_size) |
| self.key = nn.Linear(config.struct_hidden_size, self.hidden_size) |
| self.value = nn.Linear(config.struct_hidden_size, self.hidden_size) |
| |
| self.output = nn.Linear(self.hidden_size, config.seq_hidden_size) |
| self.dropout = nn.Dropout(config.attention_probs_dropout_prob) |
| self.layer_norm = nn.LayerNorm(config.seq_hidden_size, eps=config.layer_norm_eps) |
| |
| def forward( |
| self, |
| seq_hidden: torch.Tensor, |
| struct_hidden: torch.Tensor, |
| attention_mask: Optional[torch.Tensor] = None, |
| ) -> torch.Tensor: |
| """ |
| Apply cross-attention from sequence to structure. |
| |
| Args: |
| seq_hidden: Sequence hidden states |
| struct_hidden: Structure hidden states |
| attention_mask: Boolean mask (True = can attend, False = cannot attend) |
| |
| Returns: |
| Updated sequence hidden states |
| """ |
| batch_size, seq_len, _ = seq_hidden.shape |
| struct_len = struct_hidden.shape[1] |
| |
| |
| Q = self.query(seq_hidden) |
| K = self.key(struct_hidden) |
| V = self.value(struct_hidden) |
| |
| |
| Q = Q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) |
| K = K.view(batch_size, struct_len, self.num_heads, self.head_dim).transpose(1, 2) |
| V = V.view(batch_size, struct_len, self.num_heads, self.head_dim).transpose(1, 2) |
| |
| |
| # Scaled dot-product attention scores: (batch, heads, N_seq, N_struct) |
| scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim) |
| |
| |
| if attention_mask is not None: |
| # Broadcast over heads: (batch, N_seq, N_struct) -> (batch, 1, N_seq, N_struct) |
| attention_mask = attention_mask.unsqueeze(1) |
| scores = scores.masked_fill(~attention_mask, -10000.0) |
| |
| attn_weights = torch.softmax(scores, dim=-1) |
| if attention_mask is not None: |
| # Queries with no allowed keys (e.g. special tokens with residue_id=-1) would |
| # otherwise soften into a uniform attention; zero their weights instead |
| attn_weights = attn_weights * attention_mask.any(dim=-1, keepdim=True) |
| attn_weights = self.dropout(attn_weights) |
| |
| |
| context = torch.matmul(attn_weights, V) |
| |
| |
| context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.hidden_size) |
| |
| |
| output = self.output(context) |
| output = self.dropout(output) |
| |
| |
| output = self.layer_norm(seq_hidden + output) |
| |
| return output |
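| |
| # Shape sketch for one cross-attention call (illustrative): |
| #   seq_hidden:    (B, N_seq, seq_hidden_size) |
| #   struct_hidden: (B, N_struct, struct_hidden_size) |
| #   mask:          (B, N_seq, N_struct), from create_residue_level_mask |
| #   returns:       (B, N_seq, seq_hidden_size) with residual + LayerNorm applied |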
|
|
|
|
| class MultimodalGlycanBERT(nn.Module): |
| """ |
| BERTose model for glycan representation learning. |
| |
| Architecture: |
| 1. Separate encoders for each modality (sequence, MS, 3D structure) |
| 2. Cross-attention for sequence-structure alignment |
| 3. Modality-specific MLM heads |
| 4. Fusion layer for combined representation |
| """ |
| |
| def __init__(self, config: MultimodalGlycanBERTConfig): |
| super().__init__() |
| self.config = config |
| |
| |
| seq_config = config.to_seq_config() |
| seq_config.cnn_kernel_size = config.cnn_kernel_size |
|
|
| if config.use_cnn_frontend: |
| print(f"Enabled convolutional front-end (kernel={config.cnn_kernel_size})") |
| self.seq_embeddings = ConvGlycanBERTEmbeddings(seq_config) |
| else: |
| self.seq_embeddings = GlycanBERTEmbeddings(seq_config) |
| self.seq_layers = nn.ModuleList([GlycanBERTLayer(seq_config) for _ in range(seq_config.num_hidden_layers)]) |
| self.seq_mlm_head = nn.Linear(seq_config.hidden_size, seq_config.vocab_size) |
| |
| |
| ms_config = config.to_ms_config() |
| self.ms_embeddings = GlycanBERTEmbeddings(ms_config) |
| self.ms_layers = nn.ModuleList([GlycanBERTLayer(ms_config) for _ in range(ms_config.num_hidden_layers)]) |
| self.ms_mlm_head = nn.Linear(ms_config.hidden_size, ms_config.vocab_size) |
| |
| |
| if config.use_3d: |
| struct_config = config.to_struct_config() |
| self.struct_embeddings = GlycanBERTEmbeddings(struct_config) |
| self.struct_layers = nn.ModuleList([GlycanBERTLayer(struct_config) for _ in range(struct_config.num_hidden_layers)]) |
| self.struct_mlm_head = nn.Linear(struct_config.hidden_size, struct_config.vocab_size) |
| |
| |
| if config.use_cross_attention: |
| self.cross_attention = CrossAttentionLayer(config) |
| |
| |
| # Project MS / structure pooled vectors to the sequence hidden size for fusion |
| if config.ms_hidden_size != config.seq_hidden_size: |
| self.ms_projection = nn.Linear(config.ms_hidden_size, config.seq_hidden_size) |
| else: |
| self.ms_projection = nn.Identity() |
| |
| if config.use_3d and config.struct_hidden_size != config.seq_hidden_size: |
| self.struct_projection = nn.Linear(config.struct_hidden_size, config.seq_hidden_size) |
| else: |
| self.struct_projection = nn.Identity() |
| |
| |
| |
| # Fusion MLP over the concatenated pooled vectors (seq + ms [+ struct]) |
| fusion_input_size = config.seq_hidden_size * (3 if config.use_3d else 2) |
| self.fusion_layer = nn.Sequential( |
| nn.Linear(fusion_input_size, config.fusion_hidden_size), |
| nn.LayerNorm(config.fusion_hidden_size, eps=config.layer_norm_eps), |
| nn.GELU(), |
| nn.Dropout(config.hidden_dropout_prob), |
| nn.Linear(config.fusion_hidden_size, config.fusion_hidden_size), |
| ) |
| |
| |
| |
| |
| # Auxiliary pairwise-distance head: predicts a distance from |h_i - h_j| |
| self.dist_proj = nn.Linear(config.seq_hidden_size, 128) |
| self.distance_head = nn.Sequential( |
| nn.Linear(128, 64), |
| nn.ReLU(), |
| nn.Linear(64, 1) |
| ) |
| |
| |
| self.apply(self._init_weights) |
| |
| def _init_weights(self, module): |
| """Initialize weights.""" |
| if isinstance(module, nn.Linear): |
| module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) |
| if module.bias is not None: |
| module.bias.data.zero_() |
| elif isinstance(module, nn.Embedding): |
| module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) |
| if module.padding_idx is not None: |
| module.weight.data[module.padding_idx].zero_() |
| elif isinstance(module, nn.LayerNorm): |
| module.bias.data.zero_() |
| module.weight.data.fill_(1.0) |
| |
| def forward( |
| self, |
| seq_token_ids: torch.Tensor, |
| seq_attention_mask: torch.Tensor, |
| seq_residue_ids: torch.Tensor, |
| seq_branch_depths: Optional[torch.Tensor] = None, |
| seq_linkage_types: Optional[torch.Tensor] = None, |
| ms_token_ids: torch.Tensor = None, |
| ms_attention_mask: torch.Tensor = None, |
| has_ms: torch.Tensor = None, |
| struct_token_ids: Optional[torch.Tensor] = None, |
| struct_attention_mask: Optional[torch.Tensor] = None, |
| struct_residue_ids: Optional[torch.Tensor] = None, |
| has_3d: Optional[torch.Tensor] = None, |
| seq_labels: Optional[torch.Tensor] = None, |
| ms_labels: Optional[torch.Tensor] = None, |
| struct_labels: Optional[torch.Tensor] = None, |
| dist_labels: Optional[torch.Tensor] = None, |
| return_dict: bool = True, |
| ) -> Dict[str, torch.Tensor]: |
| """ |
| Forward pass for BERTose. |
| |
| Args: |
| seq_token_ids: (batch_size, seq_len) - Sequence token IDs |
| seq_attention_mask: (batch_size, seq_len) - Sequence attention mask |
| seq_residue_ids: (batch_size, seq_len) - Sequence token residue IDs |
| ms_token_ids: (batch_size, ms_len) - MS token IDs |
| ms_attention_mask: (batch_size, ms_len) - MS attention mask |
| has_ms: (batch_size,) - Boolean mask for samples with MS data |
| struct_token_ids: (batch_size, struct_len) - Structure VQ-VAE token IDs (optional) |
| struct_attention_mask: (batch_size, struct_len) - Structure attention mask (optional) |
| struct_residue_ids: (batch_size, struct_len) - Structure token residue IDs (optional) |
| has_3d: (batch_size,) - Boolean mask for samples with 3D data (optional) |
| seq_labels: (batch_size, seq_len) - Masked sequence labels (optional) |
| ms_labels: (batch_size, ms_len) - Masked MS labels (optional) |
| struct_labels: (batch_size, struct_len) - Masked structure labels (optional) |
| return_dict: Whether to return dict or tuple |
| |
| Returns: |
| Dictionary containing logits, hidden states, losses, etc. |
| """ |
| batch_size = seq_token_ids.shape[0] |
| device = seq_token_ids.device |
| |
| |
| |
| seq_hidden = self.seq_embeddings(seq_token_ids, seq_branch_depths, seq_linkage_types) |
| for layer in self.seq_layers: |
| seq_hidden = layer(seq_hidden, seq_attention_mask) |
| |
| seq_pooled = seq_hidden[:, 0, :]  # first-token ([CLS]-style) pooling |
| seq_logits = self.seq_mlm_head(seq_hidden) |
| |
| |
| |
| |
| # Pairwise-distance features on a reduced 128-d projection to keep the |
| # (B, L, L, C) tensor affordable |
| seq_hidden_small = self.dist_proj(seq_hidden) |
| |
| # (B, L, 1, 128) vs (B, 1, L, 128) -> symmetric |h_i - h_j| features |
| h_i = seq_hidden_small.unsqueeze(2) |
| h_j = seq_hidden_small.unsqueeze(1) |
| h_diff = torch.abs(h_i - h_j) |
| dist_predictions = self.distance_head(h_diff) |
| |
| |
| ms_hidden = None |
| ms_pooled = None |
| ms_logits = None |
| |
| if ms_token_ids is not None: |
| ms_hidden = self.ms_embeddings(ms_token_ids) |
| for layer in self.ms_layers: |
| ms_hidden = layer(ms_hidden, ms_attention_mask) |
| |
| ms_pooled = ms_hidden[:, 0, :] |
| ms_logits = self.ms_mlm_head(ms_hidden) |
| |
| |
| if has_ms is not None: |
| # Zero the pooled vector for samples without real MS data |
| has_ms_expanded = has_ms.unsqueeze(1).float() |
| ms_pooled = ms_pooled * has_ms_expanded |
| |
| |
| struct_pooled = None |
| struct_logits = None |
| struct_hidden = None |
| |
| if self.config.use_3d and struct_token_ids is not None: |
| struct_hidden = self.struct_embeddings(struct_token_ids) |
| for layer in self.struct_layers: |
| struct_hidden = layer(struct_hidden, struct_attention_mask) |
| |
| struct_pooled = struct_hidden[:, 0, :] |
| struct_logits = self.struct_mlm_head(struct_hidden) |
| |
| |
| if has_3d is not None: |
| # Zero the pooled vector for samples without 3D data |
| has_3d_expanded = has_3d.unsqueeze(1).float() |
| struct_pooled = struct_pooled * has_3d_expanded |
| |
| |
| |
| if self.config.use_cross_attention and struct_hidden is not None and struct_residue_ids is not None: |
| |
| |
| residue_mask = create_residue_level_mask( |
| seq_residue_ids=seq_residue_ids, |
| struct_residue_ids=struct_residue_ids, |
| ) |
| |
| |
| seq_hidden = self.cross_attention( |
| seq_hidden=seq_hidden, |
| struct_hidden=struct_hidden, |
| attention_mask=residue_mask, |
| ) |
| |
| |
| seq_pooled = seq_hidden[:, 0, :] |
| |
| |
| |
| if ms_pooled is None: |
| # No MS input in this batch: substitute zeros so fusion still has an MS slot |
| ms_pooled_projected = torch.zeros(batch_size, self.config.seq_hidden_size, device=device) |
| else: |
| ms_pooled_projected = self.ms_projection(ms_pooled) |
| |
| if self.config.use_3d and struct_pooled is not None: |
| struct_pooled_projected = self.struct_projection(struct_pooled) |
| combined = torch.cat([seq_pooled, ms_pooled_projected, struct_pooled_projected], dim=-1) |
| else: |
| combined = torch.cat([seq_pooled, ms_pooled_projected], dim=-1) |
| |
| fused_repr = self.fusion_layer(combined) |
| |
| |
| total_loss = None |
| seq_loss = None |
| ms_loss = None |
| struct_loss = None |
| dist_loss = None |
| |
| if seq_labels is not None: |
| loss_fct = nn.CrossEntropyLoss(ignore_index=-100) |
| seq_loss = loss_fct( |
| seq_logits.view(-1, self.config.seq_vocab_size), |
| seq_labels.view(-1) |
| ) |
| |
| if ms_labels is not None: |
| ms_labels_masked = ms_labels.clone() |
| if has_ms is not None: |
| ms_labels_masked[~has_ms] = -100 |
| |
| if (ms_labels_masked != -100).any(): |
| loss_fct = nn.CrossEntropyLoss(ignore_index=-100) |
| ms_loss = loss_fct( |
| ms_logits.view(-1, self.config.ms_total_vocab_size), |
| ms_labels_masked.view(-1) |
| ) |
| else: |
| ms_loss = torch.tensor(0.0, device=seq_token_ids.device) |
| |
| if self.config.use_3d and struct_labels is not None and struct_logits is not None: |
| struct_labels_masked = struct_labels.clone() |
| if has_3d is not None: |
| struct_labels_masked[~has_3d] = -100 |
| |
| if (struct_labels_masked != -100).any(): |
| loss_fct = nn.CrossEntropyLoss(ignore_index=-100) |
| struct_loss = loss_fct( |
| struct_logits.view(-1, self.config.struct_vocab_size), |
| struct_labels_masked.view(-1) |
| ) |
| else: |
| struct_loss = torch.tensor(0.0, device=seq_token_ids.device) |
| |
| |
| if dist_labels is not None: |
| |
| preds = dist_predictions.squeeze(-1) |
| |
| # Valid pairs: labeled (-1 = unlabeled) and both tokens unpadded |
| valid_mask = (dist_labels != -1) & (seq_attention_mask.unsqueeze(1) * seq_attention_mask.unsqueeze(2) == 1) |
| |
| |
| if not hasattr(self, '_dist_debug_printed'): |
| print(f"[DIST DEBUG] dist_labels shape: {dist_labels.shape}, valid_mask.sum: {valid_mask.sum().item()}") |
| self._dist_debug_printed = True |
| |
| if valid_mask.sum() > 0: |
| |
| loss_fct = nn.MSELoss() |
| dist_loss = loss_fct(preds[valid_mask], dist_labels[valid_mask].float()) |
| else: |
| dist_loss = torch.tensor(0.0, device=seq_token_ids.device) |
| else: |
| |
| if not hasattr(self, '_dist_none_printed'): |
| print("[DIST DEBUG] dist_labels is None!") |
| self._dist_none_printed = True |
| |
| |
| losses = [] |
| if seq_loss is not None: |
| losses.append(self.config.seq_loss_weight * seq_loss) |
| if ms_loss is not None: |
| losses.append(self.config.ms_loss_weight * ms_loss) |
| if struct_loss is not None: |
| losses.append(self.config.struct_loss_weight * struct_loss) |
| if dist_loss is not None: |
| losses.append(self.config.dist_loss_weight * dist_loss) |
| |
| if losses: |
| total_loss = sum(losses) |
| |
| if return_dict: |
| return { |
| 'loss': total_loss, |
| 'seq_loss': seq_loss, |
| 'ms_loss': ms_loss, |
| 'struct_loss': struct_loss, |
| 'dist_loss': dist_loss, |
| 'seq_logits': seq_logits, |
| 'ms_logits': ms_logits, |
| 'struct_logits': struct_logits, |
| 'dist_predictions': dist_predictions, |
| 'seq_hidden': seq_hidden, |
| 'ms_hidden': ms_hidden, |
| 'struct_hidden': struct_hidden, |
| 'seq_pooled': seq_pooled, |
| 'ms_pooled': ms_pooled, |
| 'struct_pooled': struct_pooled, |
| 'fused_repr': fused_repr, |
| } |
| else: |
| return (total_loss, seq_logits, ms_logits, struct_logits, fused_repr) |
| |
| def get_multimodal_representation( |
| self, |
| seq_token_ids: torch.Tensor, |
| seq_attention_mask: torch.Tensor, |
| seq_residue_ids: torch.Tensor, |
| ms_token_ids: torch.Tensor, |
| ms_attention_mask: torch.Tensor, |
| has_ms: torch.Tensor, |
| struct_token_ids: Optional[torch.Tensor] = None, |
| struct_attention_mask: Optional[torch.Tensor] = None, |
| struct_residue_ids: Optional[torch.Tensor] = None, |
| has_3d: Optional[torch.Tensor] = None, |
| ) -> torch.Tensor: |
| """Get fused multimodal representation (for inference).""" |
| outputs = self.forward( |
| seq_token_ids=seq_token_ids, |
| seq_attention_mask=seq_attention_mask, |
| seq_residue_ids=seq_residue_ids, |
| ms_token_ids=ms_token_ids, |
| ms_attention_mask=ms_attention_mask, |
| has_ms=has_ms, |
| struct_token_ids=struct_token_ids, |
| struct_attention_mask=struct_attention_mask, |
| struct_residue_ids=struct_residue_ids, |
| has_3d=has_3d, |
| return_dict=True, |
| ) |
| return outputs['fused_repr'] |
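| |
| # Inference sketch (hypothetical tensors; mirrors the smoke test below): |
| #   model.eval() |
| #   with torch.no_grad(): |
| #       fused = model.get_multimodal_representation( |
| #           seq_token_ids, seq_attention_mask, seq_residue_ids, |
| #           ms_token_ids, ms_attention_mask, has_ms, |
| #       )   # -> (batch, fusion_hidden_size) |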
|
|
|
|
| if __name__ == "__main__": |
| |
| print("="*80) |
| print("Testing BERTose model") |
| print("="*80) |
| |
| |
| config = MultimodalGlycanBERTConfig( |
| seq_vocab_size=166, |
| seq_hidden_size=768, |
| seq_num_layers=12, |
| seq_num_heads=12, |
| ms_vocab_size=242, |
| ms_hidden_size=384, |
| ms_num_layers=6, |
| ms_num_heads=6, |
| struct_vocab_size=1024, |
| struct_hidden_size=512, |
| struct_num_layers=8, |
| struct_num_heads=8, |
| use_3d=True, |
| use_cross_attention=True, |
| seq_loss_weight=0.60, |
| ms_loss_weight=0.15, |
| struct_loss_weight=0.25, |
| ) |
| |
| print(f"\nConfig:") |
| print(f" Sequence vocab: {config.seq_vocab_size}") |
| print(f" MS vocab: {config.ms_vocab_size}") |
| print(f" Structure vocab: {config.struct_vocab_size}") |
| print(f" Loss weights: seq={config.seq_loss_weight}, ms={config.ms_loss_weight}, struct={config.struct_loss_weight}") |
| |
| |
| model = MultimodalGlycanBERT(config) |
| |
| |
| total_params = sum(p.numel() for p in model.parameters()) |
| trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) |
| |
| print(f"\nModel Parameters:") |
| print(f" Total: {total_params:,}") |
| print(f" Trainable: {trainable_params:,}") |
| |
| |
| print(f"\n{'='*80}") |
| print("Testing Forward Pass (with Conv front-end)") |
| print("="*80) |
| |
| batch_size = 4 |
| seq_len = 128 |
| ms_len = 50 |
| struct_len = 40 |
| |
| |
| seq_token_ids = torch.randint(0, config.seq_vocab_size, (batch_size, seq_len)) |
| seq_attention_mask = torch.ones(batch_size, seq_len) |
| |
| # Synthetic residue ids: ~5 sequence tokens per residue |
| seq_residue_ids = torch.div( |
| torch.arange(seq_len), 5, rounding_mode="floor" |
| ).unsqueeze(0).expand(batch_size, -1) |
|
|
| ms_token_ids = torch.randint(config.ms_vocab_offset, config.ms_total_vocab_size, (batch_size, ms_len)) |
| ms_attention_mask = torch.ones(batch_size, ms_len) |
| struct_token_ids = torch.randint(0, config.struct_vocab_size, (batch_size, struct_len)) |
| struct_attention_mask = torch.ones(batch_size, struct_len) |
| |
| # 4 VQ-VAE tokens per residue, matching the module docstring |
| struct_residue_ids = torch.div( |
| torch.arange(struct_len), 4, rounding_mode="floor" |
| ).unsqueeze(0).expand(batch_size, -1) |
|
|
| has_ms = torch.tensor([True, True, False, True]) |
| has_3d = torch.tensor([True, False, True, True]) |
| |
| |
| # Crude MLM targets for the smoke test: supervise only positions that happen |
| # to already hold the mask token id |
| seq_labels = seq_token_ids.clone() |
| seq_labels[seq_labels != config.mask_token_id] = -100 |
| ms_labels = ms_token_ids.clone() |
| ms_labels[ms_labels != config.mask_token_id] = -100 |
| struct_labels = struct_token_ids.clone() |
| struct_labels[struct_labels != config.mask_token_id] = -100 |
| |
| |
| outputs = model( |
| seq_token_ids=seq_token_ids, |
| seq_attention_mask=seq_attention_mask, |
| seq_residue_ids=seq_residue_ids, |
| ms_token_ids=ms_token_ids, |
| ms_attention_mask=ms_attention_mask, |
| has_ms=has_ms, |
| struct_token_ids=struct_token_ids, |
| struct_attention_mask=struct_attention_mask, |
| struct_residue_ids=struct_residue_ids, |
| has_3d=has_3d, |
| seq_labels=seq_labels, |
| ms_labels=ms_labels, |
| struct_labels=struct_labels, |
| ) |
| |
| print(f"\nOutput shapes:") |
| print(f" seq_logits: {outputs['seq_logits'].shape}") |
| print(f" ms_logits: {outputs['ms_logits'].shape}") |
| print(f" struct_logits: {outputs['struct_logits'].shape}") |
| print(f" fused_repr: {outputs['fused_repr'].shape}") |
| |
| print(f"\nLosses:") |
| print(f" Total loss: {outputs['loss'].item():.4f}") |
| print(f" Sequence loss: {outputs['seq_loss'].item():.4f}") |
| print(f" MS loss: {outputs['ms_loss'].item():.4f}") |
| print(f" Structure loss: {outputs['struct_loss'].item():.4f}") |
| |
| print(f"\n{'='*80}") |
| print("Model Test Complete!") |
| print("="*80) |
|
|