andrecornman committed on
Commit
2eddd0d
·
verified ·
1 Parent(s): 414cb8c

Upload FlashPPI model

Browse files
__KMP_REGISTERED_LIB_23805 ADDED
Binary file (1.02 kB). View file
 
__KMP_REGISTERED_LIB_91112 ADDED
Binary file (1.02 kB). View file
 
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "FlashPPIModel"
4
+ ],
5
+ "clip_embed_dim": 1024,
6
+ "contact_embed_dim": 1280,
7
+ "contact_num_heads": 8,
8
+ "contact_transformer_depth": 2,
9
+ "dtype": "float32",
10
+ "max_position_embeddings": 512,
11
+ "model_type": "flashppi",
12
+ "plm_depth": 33,
13
+ "plm_dim": 1280,
14
+ "plm_ffn_dim_multiplier": null,
15
+ "plm_heads": 20,
16
+ "plm_norm_eps": 1e-05,
17
+ "plm_swiglu_multiple_of": 256,
18
+ "plm_vocab_size": 37,
19
+ "transformers_version": "4.57.1",
20
+ "use_flash_attention": true,
21
+ "auto_map": {
22
+ "AutoConfig": "configuration_flashppi.FlashPPIConfig",
23
+ "AutoModel": "modeling_flashppi.FlashPPIModel"
24
+ }
25
+ }
configuration_flashppi.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""FlashPPI model configuration."""

from transformers import PretrainedConfig


class FlashPPIConfig(PretrainedConfig):
    """Configuration for the FlashPPI protein-protein interaction model.

    Bundles two groups of hyperparameters: the gLM2 protein language model
    backbone (``plm_*``, defaults match gLM2_650M) and the FlashPPI heads
    (CLIP-style contrastive head and contact-prediction head).
    """

    model_type = "flashppi"

    def __init__(
        self,
        # gLM2 backbone config (defaults match gLM2_650M)
        plm_dim: int = 1280,  # backbone hidden size
        plm_depth: int = 33,  # number of transformer layers
        plm_heads: int = 20,  # attention heads per layer
        plm_vocab_size: int = 37,  # matches the gLM2 tokenizer vocabulary
        plm_norm_eps: float = 1e-5,  # RMSNorm epsilon
        plm_swiglu_multiple_of: int = 256,  # FFN hidden size rounded up to this multiple
        plm_ffn_dim_multiplier: float = None,  # optional extra FFN width factor (None = off)
        # FlashPPI head config
        clip_embed_dim: int = 1024,  # contrastive (CLIP) projection dimension
        contact_embed_dim: int = 1280,  # contact head q/k projection dimension
        contact_num_heads: int = 8,  # contact head attention heads
        contact_transformer_depth: int = 2,  # transformer depth inside the contact head
        max_position_embeddings: int = 512,
        use_flash_attention: bool = True,
        **kwargs
    ):
        super().__init__(**kwargs)
        # gLM2 config
        self.plm_dim = plm_dim
        self.plm_depth = plm_depth
        self.plm_heads = plm_heads
        self.plm_vocab_size = plm_vocab_size
        self.plm_norm_eps = plm_norm_eps
        self.plm_swiglu_multiple_of = plm_swiglu_multiple_of
        self.plm_ffn_dim_multiplier = plm_ffn_dim_multiplier
        # FlashPPI config
        self.clip_embed_dim = clip_embed_dim
        self.contact_embed_dim = contact_embed_dim
        self.contact_num_heads = contact_num_heads
        self.contact_transformer_depth = contact_transformer_depth
        self.max_position_embeddings = max_position_embeddings
        self.use_flash_attention = use_flash_attention
glm_tokenizer.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tokenizers import Tokenizer
2
+ from tokenizers.models import BPE
3
+ from transformers import PreTrainedTokenizerFast
4
+
5
+
6
class gLM2Tokenizer(PreTrainedTokenizerFast):
    """Character-level tokenizer for gLM2 sequences.

    Builds a fixed 37-token BPE vocabulary (no merges) covering amino acids,
    lowercase nucleotides, and special tokens, and wraps it in a fast
    tokenizer.

    NOTE(review): ``pos_token``/``neg_token`` (<+>/<->) are accepted and
    registered as special tokens on the inner tokenizer, but are not
    forwarded to ``super().__init__`` — confirm they are intentionally not
    exposed as named attributes on the tokenizer.
    """

    # Token order defines the token IDs (index = id); must stay in sync with
    # tokenizer.json.
    VOCAB = [
        "<cls>", "<pad>", "<eos>", "<unk>",
        "L", "A", "G", "V", "S", "E", "R", "T", "I", "D", "P", "K",
        "Q", "N", "F", "Y", "M", "H", "W", "C", "X", "B", "U", "Z",
        "O", "a", "t", "c", "g", "<+>", "<->", "<mask>", "<sep>",
    ]

    def __init__(
        self,
        unk_token="<unk>",
        cls_token="<cls>",
        pad_token="<pad>",
        mask_token="<mask>",
        eos_token="<eos>",
        sep_token="<sep>",
        pos_token="<+>",
        neg_token="<->",
        **kwargs,
    ):
        all_tokens = self.VOCAB
        token_to_id = {tok: ind for ind, tok in enumerate(all_tokens)}

        # BPE with an explicit vocab and no merges behaves as a plain
        # symbol-table tokenizer.
        bpe = BPE(token_to_id, merges=[], unk_token=str(unk_token))
        tokenizer = Tokenizer(bpe)
        special_tokens = [cls_token, pad_token,
                          mask_token, eos_token, sep_token, pos_token, neg_token]

        # Tokens already exist in the vocab; this only marks them special.
        tokenizer.add_special_tokens(
            special_tokens,
        )

        super().__init__(
            tokenizer_object=tokenizer,
            unk_token=unk_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            eos_token=eos_token,
            sep_token=sep_token,
            **kwargs,
        )
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:783abc99f0d39c350d9be2e553dbf407b9ebc0fa1d288b31f418b4a3ef223f2c
3
+ size 2931379208
modeling_flashppi.py ADDED
@@ -0,0 +1,560 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from dataclasses import dataclass
5
+ from typing import Optional, Tuple, Union
6
+ from einops import rearrange, repeat
7
+ from torch.utils.checkpoint import checkpoint
8
+ from transformers import PreTrainedModel
9
+ from transformers.modeling_outputs import ModelOutput
10
+
11
+ from .configuration_flashppi import FlashPPIConfig
12
+
13
# Detect Flash Attention installation.
# When flash-attn is importable, use its fused swiglu / rotary / varlen
# attention / Triton RMSNorm kernels; otherwise fall back to pure-PyTorch
# equivalents defined below and route all code through the padded paths.
try:
    from flash_attn.ops.activations import swiglu
    from flash_attn.layers.rotary import apply_rotary_emb_func
    from flash_attn import flash_attn_varlen_kvpacked_func
    from flash_attn.bert_padding import pad_input, unpad_input
    from flash_attn.ops.triton.layer_norm import RMSNorm
    FLASH_ATTN_AVAILABLE = True
except ImportError:
    FLASH_ATTN_AVAILABLE = False
    # Placeholders so module-level names exist; guarded by
    # FLASH_ATTN_AVAILABLE at every call site.
    unpad_input = pad_input = apply_rotary_emb_func = None
    flash_attn_varlen_kvpacked_func = None

    def swiglu(x, y):
        # Pure-PyTorch SwiGLU gate: silu(x) * y.
        return F.silu(x) * y

    class RMSNorm(nn.Module):
        """RMSNorm without variance_epsilon buffer for checkpoint compatibility."""
        def __init__(self, dim, eps=1e-6):
            super().__init__()
            self.weight = nn.Parameter(torch.ones(dim))
            self.eps = eps

        def forward(self, hidden_states):
            # Normalize in float32 for numerical stability, then cast back
            # to the caller's dtype.
            input_dtype = hidden_states.dtype
            hidden_states = hidden_states.to(torch.float32)
            variance = hidden_states.pow(2).mean(-1, keepdim=True)
            hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
            return (self.weight * hidden_states).to(input_dtype)
42
+
43
+
44
@dataclass
class FlashPPIOutput(ModelOutput):
    """Output type for FlashPPI model.

    Args:
        contact_map: (B, L1, L2) contact probabilities between residue pairs.
        contact_score: (B,) maximum contact probability per pair.
        clip_embed1: (B, D) CLIP embedding for first protein.
        clip_embed2: (B, D) CLIP embedding for second protein.
        clip_score: (B,) CLIP similarity score (cosine similarity).
    """
    # Populated by FlashPPIModel.forward; ModelOutput makes fields accessible
    # both as attributes and dict-style keys.
    contact_map: Optional[torch.FloatTensor] = None
    contact_score: Optional[torch.FloatTensor] = None
    clip_embed1: Optional[torch.FloatTensor] = None
    clip_embed2: Optional[torch.FloatTensor] = None
    clip_score: Optional[torch.FloatTensor] = None
60
+
61
+
62
def rotate_half(x, interleaved=False):
    """Rotate the trailing dimension of ``x`` for rotary embeddings.

    Non-interleaved layout treats the last dim as two stacked halves
    (a, b) -> (-b, a); interleaved layout swaps/negates adjacent
    even/odd element pairs while keeping their positions.
    """
    if interleaved:
        even, odd = x[..., ::2], x[..., 1::2]
        # Pair up (-odd, even) and flatten back into the original layout.
        return torch.stack((-odd, even), dim=-1).flatten(start_dim=-2)
    first_half, second_half = x.chunk(2, dim=-1)
    return torch.cat((-second_half, first_half), dim=-1)
69
+
70
+
71
def apply_rotary_emb_torch(x, cos, sin, interleaved=False, position_ids=None):
    """Apply rotary embeddings using pure PyTorch.

    ``x`` is a padded (B, L, H, D) tensor; ``cos``/``sin`` are cached
    (cache_len, rot_dim/2) tables. Only the leading ``rot_dim`` channels
    are rotated; the remainder passes through unchanged.
    """
    # Select per-position rows: either gather by explicit position_ids or
    # truncate the cache to the sequence length.
    if position_ids is None:
        cos, sin = cos[: x.shape[1]], sin[: x.shape[1]]
    else:
        cos, sin = cos[position_ids], sin[position_ids]

    # Expand the half-width tables to full rotary width, inserting a
    # broadcast axis for the heads dimension.
    if interleaved:
        # (d 2): each frequency duplicated into adjacent slots.
        cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
        sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
    else:
        # (2 d): the whole table duplicated half/half.
        cos = torch.cat((cos, cos), dim=-1).unsqueeze(-2)
        sin = torch.cat((sin, sin), dim=-1).unsqueeze(-2)

    rot_dim = cos.shape[-1]
    head, tail = x[..., :rot_dim], x[..., rot_dim:]

    # Inline 90-degree rotation of the rotary channels.
    if interleaved:
        rotated = torch.stack((-head[..., 1::2], head[..., ::2]), dim=-1).flatten(start_dim=-2)
    else:
        h1, h2 = head.chunk(2, dim=-1)
        rotated = torch.cat((-h2, h1), dim=-1)

    return torch.cat((head * cos + rotated * sin, tail), dim=-1)
92
+
93
+
94
class RotaryEmbedding(nn.Module):
    """Rotary position embeddings with flash attention support.

    Lazily builds and grows cos/sin tables, then applies RoPE either via the
    fused flash-attn kernel (unpadded varlen layout) or via the pure-PyTorch
    fallback (padded layout).
    """

    def __init__(self, dim: int, base: float = 10000.0, interleaved: bool = False, device=None):
        super().__init__()
        self.dim = dim
        self.base = float(base)
        self.interleaved = interleaved
        # Standard RoPE inverse frequencies: base^(-2i/dim), i in [0, dim/2).
        inv_freq = 1.0 / (self.base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # Lazy cos/sin cache, rebuilt by _update_cos_sin_cache as needed.
        self._seq_len_cached = 0
        self._cos_cached = None
        self._sin_cached = None

    def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
        # Rebuild when the requested length exceeds the cache or the cache
        # lives on a different device.
        # NOTE(review): dtype is not part of the invalidation check, so a
        # dtype change alone reuses the previously-cached tables — confirm
        # this is intended.
        if seqlen > self._seq_len_cached or self._cos_cached is None or self._cos_cached.device != device:
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=torch.float32)
            freqs = torch.outer(t, self.inv_freq.to(device=device, dtype=torch.float32))
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)

    def forward(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        position_ids: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply rotary embeddings to q and k; returns the rotated pair.

        With cu_seqlens/max_seqlen (unpadded varlen layout) the flash-attn
        kernel is used in place; otherwise the PyTorch fallback runs on the
        padded layout with optional per-token position_ids.
        """
        # In the varlen path q is (total_tokens, H, D), so the cache length
        # must come from max_seqlen rather than q.shape[1].
        seqlen = q.shape[1] if max_seqlen is None else max_seqlen
        self._update_cos_sin_cache(seqlen, device=q.device, dtype=q.dtype)

        if FLASH_ATTN_AVAILABLE and cu_seqlens is not None:
            q = apply_rotary_emb_func(
                q, self._cos_cached, self._sin_cached,
                interleaved=self.interleaved, inplace=True,
                cu_seqlens=cu_seqlens, max_seqlen=max_seqlen,
            )
            k = apply_rotary_emb_func(
                k, self._cos_cached, self._sin_cached,
                interleaved=self.interleaved, inplace=True,
                cu_seqlens=cu_seqlens, max_seqlen=max_seqlen,
            )
        else:
            q = apply_rotary_emb_torch(q, self._cos_cached, self._sin_cached, self.interleaved, position_ids)
            k = apply_rotary_emb_torch(k, self._cos_cached, self._sin_cached, self.interleaved, position_ids)
        return q, k
142
+
143
+
144
class Attention(nn.Module):
    """Multi-head attention with optional flash attention.

    Supports two input layouts: unpadded varlen (x is (total_tokens, dim)
    with cu_seqlens) using the flash-attn kernel, and padded (B, L, dim)
    using torch scaled_dot_product_attention.
    """

    def __init__(self, dim: int, num_heads: int, use_rope: bool = True):
        super().__init__()
        self.n_heads = num_heads
        self.head_dim = dim // num_heads
        # Fused QKV projection; all projections are bias-free.
        self.wqkv = nn.Linear(dim, num_heads * self.head_dim * 3, bias=False)
        self.wo = nn.Linear(num_heads * self.head_dim, dim, bias=False)
        self.rotary_emb = RotaryEmbedding(self.head_dim) if use_rope else None

    def forward(
        self,
        x: torch.Tensor,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seq_len: Optional[int] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        qkv = self.wqkv(x)

        if cu_seqlens is not None and FLASH_ATTN_AVAILABLE:
            # Flash attention path (unpadded): x is (total_tokens, dim).
            total_seqlen = x.shape[0]
            q, k, v = torch.split(qkv, self.n_heads * self.head_dim, dim=-1)
            q = q.view(total_seqlen, self.n_heads, self.head_dim)
            k = k.view(total_seqlen, self.n_heads, self.head_dim)
            v = v.view(total_seqlen, self.n_heads, self.head_dim)

            if self.rotary_emb is not None:
                q, k = self.rotary_emb(q, k, cu_seqlens=cu_seqlens, max_seqlen=max_seq_len)

            # Pack k/v together as expected by the kvpacked varlen kernel.
            kv = torch.stack([k, v], 1)
            output = flash_attn_varlen_kvpacked_func(
                q, kv,
                cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens,
                max_seqlen_q=max_seq_len, max_seqlen_k=max_seq_len,
                dropout_p=0.0, causal=False,
            )
            output = output.view(total_seqlen, self.n_heads * self.head_dim)
        else:
            # SDPA path (padded): x is (B, L, dim).
            bsz, seqlen, _ = x.shape
            q, k, v = torch.split(qkv, self.n_heads * self.head_dim, dim=-1)
            q = q.view(bsz, seqlen, self.n_heads, self.head_dim)
            k = k.view(bsz, seqlen, self.n_heads, self.head_dim)
            v = v.view(bsz, seqlen, self.n_heads, self.head_dim)

            if self.rotary_emb is not None:
                q, k = self.rotary_emb(q, k, position_ids=position_ids)

            # SDPA expects (B, H, L, D).
            q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)

            attn_mask = None
            if attention_mask is not None:
                # (B, L) -> (B, 1, 1, L) boolean key-padding mask.
                attn_mask = attention_mask.unsqueeze(1).unsqueeze(2).bool()

            output = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
            output = output.transpose(1, 2).contiguous().view(bsz, seqlen, self.n_heads * self.head_dim)

        return self.wo(output)
205
+
206
+
207
class FeedForward(nn.Module):
    """SwiGLU feedforward network.

    The hidden width starts at ``int(2 * dim * hidden_mult / 3)``, is
    optionally scaled by ``ffn_dim_multiplier``, then rounded up to the
    next multiple of ``multiple_of`` (Llama-style sizing). All projections
    are bias-free.
    """

    def __init__(self, dim: int, hidden_mult: float = 4.0, multiple_of: int = 256, ffn_dim_multiplier: float = None):
        super().__init__()
        width = int(2 * dim * hidden_mult / 3)
        if ffn_dim_multiplier is not None:
            width = int(ffn_dim_multiplier * width)
        # Round up to the nearest multiple of `multiple_of`.
        width = ((width + multiple_of - 1) // multiple_of) * multiple_of
        self.w1 = nn.Linear(dim, width, bias=False)
        self.w2 = nn.Linear(width, dim, bias=False)
        self.w3 = nn.Linear(dim, width, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gated = swiglu(self.w1(x), self.w3(x))
        return self.w2(gated)
222
+
223
+
224
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: RMSNorm -> attention -> residual, then
    RMSNorm -> SwiGLU FFN -> residual."""

    def __init__(self, dim: int, num_heads: int, norm_eps: float = 1e-6,
                 multiple_of: int = 256, ffn_dim_multiplier: float = None, use_rope: bool = True):
        super().__init__()
        self.attention = Attention(dim, num_heads, use_rope)
        self.feed_forward = FeedForward(dim, multiple_of=multiple_of, ffn_dim_multiplier=ffn_dim_multiplier)
        self.attention_norm = RMSNorm(dim, eps=norm_eps)
        self.ffn_norm = RMSNorm(dim, eps=norm_eps)

    def forward(
        self,
        x: torch.Tensor,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seq_len: Optional[int] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # Pre-norm residual wiring: x + Attn(norm(x)), then h + FFN(norm(h)).
        h = x + self.attention(self.attention_norm(x), cu_seqlens, max_seq_len, attention_mask, position_ids)
        return h + self.feed_forward(self.ffn_norm(h))
245
+
246
+
247
class TransformerLayers(nn.Module):
    """Stack of transformer blocks with optional flash attention.

    When flash-attn is available and the batch actually contains padding,
    the input is unpadded once, run through all layers in varlen layout,
    and re-padded at the end; otherwise the padded SDPA path is used with
    per-token position ids derived from the mask.
    """

    def __init__(self, dim: int, num_heads: int, depth: int, norm_eps: float = 1e-6,
                 multiple_of: int = 256, ffn_dim_multiplier: float = None, use_rope: bool = True):
        super().__init__()
        self.dim = dim
        self.layers = nn.ModuleList([
            TransformerBlock(dim, num_heads, norm_eps, multiple_of, ffn_dim_multiplier, use_rope)
            for _ in range(depth)
        ])
        # Toggled externally (e.g. by gradient_checkpointing_enable machinery).
        self.gradient_checkpointing = False

    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        batch_size, seq_len = x.shape[:2]
        cu_seqlens, max_seq_len_in_batch, indices, position_ids = None, None, None, None

        if FLASH_ATTN_AVAILABLE and attention_mask is not None and not attention_mask.all():
            # Varlen path: strip padding tokens; the mask is then implicit
            # in cu_seqlens, so no per-layer mask is needed.
            x, indices, cu_seqlens, max_seq_len_in_batch, _ = unpad_input(x, attention_mask)
            mask_for_layers = None
        elif attention_mask is not None:
            # Padded path: RoPE positions count only non-pad tokens
            # (clamped so leading pads map to position 0).
            mask_long = attention_mask.long()
            position_ids = (mask_long.cumsum(dim=1) - 1).clamp(min=0)
            mask_for_layers = attention_mask
        else:
            mask_for_layers = None

        for layer in self.layers:
            if self.training and self.gradient_checkpointing:
                x = checkpoint(layer, x, cu_seqlens, max_seq_len_in_batch, mask_for_layers, position_ids, use_reentrant=False)
            else:
                x = layer(x, cu_seqlens, max_seq_len_in_batch, mask_for_layers, position_ids)

        # Restore the original padded (B, L, D) layout if we unpadded above.
        if FLASH_ATTN_AVAILABLE and indices is not None:
            x = pad_input(x, indices, batch_size, seq_len)

        return x
284
+
285
+
286
class GLM2Backbone(nn.Module):
    """gLM2 protein language model backbone: token embedding + transformer
    encoder, returning residue-level hidden states."""

    def __init__(self, config: FlashPPIConfig):
        super().__init__()
        self.config = config
        self.tok_embeddings = nn.Embedding(config.plm_vocab_size, config.plm_dim)
        self.encoder = TransformerLayers(
            dim=config.plm_dim,
            num_heads=config.plm_heads,
            depth=config.plm_depth,
            norm_eps=config.plm_norm_eps,
            multiple_of=config.plm_swiglu_multiple_of,
            ffn_dim_multiplier=config.plm_ffn_dim_multiplier,
            use_rope=True,
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        # (B, L) token ids -> (B, L, plm_dim) residue embeddings.
        h = self.tok_embeddings(input_ids)
        return self.encoder(h, attention_mask)
306
+
307
+
308
class MLPHead(nn.Module):
    """SwiGLU MLP projection head.

    Projects ``in_dim`` features to ``out_dim`` through a gated hidden
    layer of width ``int(in_dim * hidden_mult)``; all linear maps are
    bias-free.
    """

    def __init__(self, in_dim: int, out_dim: int, hidden_mult: float = 2.0):
        super().__init__()
        width = int(in_dim * hidden_mult)
        # w1/w2/w3 creation order kept stable for RNG reproducibility.
        self.w1 = nn.Linear(in_dim, width, bias=False)
        self.w2 = nn.Linear(width, out_dim, bias=False)
        self.w3 = nn.Linear(in_dim, width, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate = F.silu(self.w1(x))
        value = self.w3(x)
        return self.w2(gate * value)
320
+
321
+
322
class ContrastiveHead(nn.Module):
    """CLIP-style contrastive head: masked mean-pool over residues,
    project through a SwiGLU MLP, then L2-normalize."""

    def __init__(self, hidden_dim: int, embed_dim: int):
        super().__init__()
        self.head = MLPHead(hidden_dim, embed_dim)

    def forward(self, residue_embeds: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        keep = mask.unsqueeze(-1).bool()
        # Zero padded residues, then average over valid positions only
        # (clamp guards against a fully-masked sequence).
        summed = residue_embeds.masked_fill(~keep, 0.0).sum(dim=1)
        count = keep.sum(dim=1).float().clamp(min=1.0)
        pooled = summed / count
        return F.normalize(self.head(pooled), dim=-1)
334
+
335
+
336
class ContactHead(nn.Module):
    """Contact prediction head using cross-attention between protein pairs.

    The two proteins are concatenated (with learned segment embeddings),
    jointly refined by a small transformer, and the per-head q·k attention
    logits between protein 1 and protein 2 positions are mixed into a single
    contact logit per residue pair.
    """

    def __init__(self, input_dim: int, contact_dim: int, num_heads: int = 8, depth: int = 2):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = contact_dim // num_heads
        assert contact_dim % num_heads == 0

        # Distinguishes protein 1 (id 0) from protein 2 (id 1).
        self.segment_embed = nn.Embedding(2, input_dim)
        nn.init.normal_(self.segment_embed.weight, std=0.02)

        self.transformer = TransformerLayers(input_dim, num_heads, depth, use_rope=True)
        self.norm = nn.LayerNorm(input_dim)
        self.q_proj = nn.Linear(input_dim, contact_dim, bias=True)
        self.k_proj = nn.Linear(input_dim, contact_dim, bias=True)
        # Mixes per-head logits into one; bias -3.0 biases initial
        # predictions toward "no contact".
        self.output_mix = nn.Linear(num_heads, 1)
        nn.init.constant_(self.output_mix.bias, -3.0)
        self.scale = self.head_dim ** -0.5

    def forward(
        self,
        embed1: torch.Tensor,
        embed2: torch.Tensor,
        mask1: torch.Tensor,
        mask2: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return (contact_logits (B, L1, L2), valid_mask (B, L1, L2))."""
        B, L1, D = embed1.shape
        _, L2, _ = embed2.shape

        # Segment embeddings broadcast over the batch dimension.
        seg1 = self.segment_embed(torch.zeros(L1, device=embed1.device, dtype=torch.long))
        seg2 = self.segment_embed(torch.ones(L2, device=embed1.device, dtype=torch.long))

        # Joint refinement over the concatenated pair.
        x = torch.cat([embed1 + seg1.unsqueeze(0), embed2 + seg2.unsqueeze(0)], dim=1)
        combined_mask = torch.cat([mask1, mask2], dim=1).bool() if mask1 is not None and mask2 is not None else None

        x = self.transformer(x, attention_mask=combined_mask)

        # Split back into the two proteins and normalize.
        embed1 = self.norm(x[:, :L1, :])
        embed2 = self.norm(x[:, L1:, :])

        # (B, H, L, head_dim) for per-head scaled dot products.
        q = self.q_proj(embed1).view(B, L1, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(embed2).view(B, L2, self.num_heads, self.head_dim).transpose(1, 2)

        attn_logits = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        # (B, H, L1, L2) -> (B, L1, L2, H) so the head axis can be mixed.
        attn_logits = attn_logits.permute(0, 2, 3, 1).contiguous()
        contact_logits = self.output_mix(attn_logits).squeeze(-1)

        # A residue pair is valid only when both positions are unmasked.
        if mask1 is not None and mask2 is not None:
            valid_mask = (mask1.unsqueeze(2) * mask2.unsqueeze(1)).bool()
        else:
            valid_mask = torch.ones_like(contact_logits, dtype=torch.bool)

        return contact_logits, valid_mask
390
+
391
+
392
class FlashPPIPreTrainedModel(PreTrainedModel):
    """Base class for FlashPPI models: wires FlashPPIConfig into the
    Transformers loading/saving machinery and defines weight init."""

    config_class = FlashPPIConfig
    base_model_prefix = "flashppi"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        # Standard truncated-normal-style init for linear/embedding layers.
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, std=0.02)
        elif isinstance(module, RotaryEmbedding):
            # Re-calculate the frequencies using the module's stored attributes
            # (the inv_freq buffer is non-persistent, so it is not restored
            # from checkpoints and must be recomputed here).
            inv_freq = 1.0 / (
                module.base
                ** (
                    torch.arange(0, module.dim, 2, device=module.inv_freq.device, dtype=torch.float32)
                    / module.dim
                )
            )
            # Force the buffer to update
            with torch.no_grad():
                module.inv_freq.copy_(inv_freq)
418
+
419
class FlashPPIModel(FlashPPIPreTrainedModel):
    """FlashPPI model: gLM2 backbone plus CLIP-style contrastive heads and a
    residue-level contact-prediction head for protein pairs."""

    def __init__(self, config: FlashPPIConfig):
        super().__init__(config)
        self.config = config

        # gLM2 backbone
        self.plm = GLM2Backbone(config)

        # CLIP heads (asymmetric for query/key)
        self.head_q = ContrastiveHead(config.plm_dim, config.clip_embed_dim)
        self.head_k = ContrastiveHead(config.plm_dim, config.clip_embed_dim)
        # Learnable contrastive temperature, initialized at ln(1/0.07).
        self.logit_scale = nn.Parameter(torch.ones([]) * 2.6593)  # ln(1/0.07)

        # Contact prediction head
        self.contact_head = ContactHead(
            config.plm_dim,
            config.contact_embed_dim,
            num_heads=config.contact_num_heads,
            depth=config.contact_transformer_depth,
        )

        self.post_init()

    def encode_protein(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Encode a protein sequence to residue-level embeddings.

        Args:
            input_ids: (B, L) token IDs from gLM2 tokenizer.
            attention_mask: (B, L) attention mask.

        Returns:
            (B, L, plm_dim) residue embeddings.
        """
        return self.plm(input_ids, attention_mask)

    def predict_contacts(
        self,
        embed1: torch.Tensor,
        embed2: torch.Tensor,
        mask1: torch.Tensor,
        mask2: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Predict contact map from pre-computed residue embeddings.

        This method is useful for efficient 2-stage inference where embeddings
        are pre-computed and cached.

        Args:
            embed1: (B, L1, D) residue embeddings for protein 1.
            embed2: (B, L2, D) residue embeddings for protein 2.
            mask1: (B, L1) attention mask for protein 1.
            mask2: (B, L2) attention mask for protein 2.

        Returns:
            contact_logits: (B, L1, L2) raw logits.
            valid_mask: (B, L1, L2) mask for valid positions.
        """
        return self.contact_head(embed1, embed2, mask1, mask2)

    def forward(
        self,
        input_ids1: torch.Tensor,
        input_ids2: torch.Tensor,
        attention_mask1: Optional[torch.Tensor] = None,
        attention_mask2: Optional[torch.Tensor] = None,
        return_dict: bool = True,
    ) -> Union[Tuple, FlashPPIOutput]:
        """Forward pass for protein pair interaction prediction.

        Args:
            input_ids1: (B, L1) token IDs for protein 1.
            input_ids2: (B, L2) token IDs for protein 2.
            attention_mask1: (B, L1) attention mask for protein 1.
            attention_mask2: (B, L2) attention mask for protein 2.
            return_dict: Whether to return a FlashPPIOutput or tuple.

        Returns:
            FlashPPIOutput with contact predictions and CLIP embeddings.
        """
        B = input_ids1.shape[0]
        L1, L2 = input_ids1.shape[1], input_ids2.shape[1]

        if attention_mask1 is None:
            attention_mask1 = torch.ones_like(input_ids1)
        if attention_mask2 is None:
            attention_mask2 = torch.ones_like(input_ids2)

        # Encode both proteins in a single batched PLM call for efficiency
        # Pad to same length if needed
        if L1 != L2:
            max_len = max(L1, L2)
            if L1 < max_len:
                pad_len = max_len - L1
                # Pad token id 0; padded positions are masked out below.
                input_ids1 = F.pad(input_ids1, (0, pad_len), value=0)
                attention_mask1 = F.pad(attention_mask1, (0, pad_len), value=0)
            if L2 < max_len:
                pad_len = max_len - L2
                input_ids2 = F.pad(input_ids2, (0, pad_len), value=0)
                attention_mask2 = F.pad(attention_mask2, (0, pad_len), value=0)

        # Batch both sequences for single PLM forward pass
        batched_input_ids = torch.cat([input_ids1, input_ids2], dim=0)
        batched_attention_mask = torch.cat([attention_mask1, attention_mask2], dim=0)
        batched_embeds = self.encode_protein(batched_input_ids, batched_attention_mask)

        # Split and trim back to original lengths
        residue_embeds1 = batched_embeds[:B, :L1, :]
        residue_embeds2 = batched_embeds[B:, :L2, :]
        attention_mask1 = attention_mask1[:, :L1]
        attention_mask2 = attention_mask2[:, :L2]

        # Contrastive embeddings (both heads L2-normalize, so the dot
        # product below is a cosine similarity).
        clip_embed1 = self.head_q(residue_embeds1, attention_mask1)
        clip_embed2 = self.head_k(residue_embeds2, attention_mask2)
        clip_score = (clip_embed1 * clip_embed2).sum(dim=-1)

        # Contact prediction
        contact_logits, valid_mask = self.contact_head(
            residue_embeds1, residue_embeds2, attention_mask1, attention_mask2
        )
        contact_map = torch.sigmoid(contact_logits)

        # Mask invalid positions before taking max
        contact_map_masked = contact_map.masked_fill(~valid_mask, 0.0)
        contact_score = contact_map_masked.flatten(1).max(dim=-1).values

        if not return_dict:
            return (contact_map, contact_score, clip_embed1, clip_embed2, clip_score)

        return FlashPPIOutput(
            contact_map=contact_map,
            contact_score=contact_score,
            clip_embed1=clip_embed1,
            clip_embed2=clip_embed2,
            clip_score=clip_score,
        )
special_tokens_map.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "<cls>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<eos>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "mask_token": {
17
+ "content": "<mask>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "pad_token": {
24
+ "content": "<pad>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "sep_token": {
31
+ "content": "<sep>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "unk_token": {
38
+ "content": "<unk>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ }
44
+ }
tokenizer.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<cls>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<pad>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<eos>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<unk>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 33,
44
+ "content": "<+>",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ },
51
+ {
52
+ "id": 34,
53
+ "content": "<->",
54
+ "single_word": false,
55
+ "lstrip": false,
56
+ "rstrip": false,
57
+ "normalized": false,
58
+ "special": true
59
+ },
60
+ {
61
+ "id": 35,
62
+ "content": "<mask>",
63
+ "single_word": false,
64
+ "lstrip": false,
65
+ "rstrip": false,
66
+ "normalized": false,
67
+ "special": true
68
+ },
69
+ {
70
+ "id": 36,
71
+ "content": "<sep>",
72
+ "single_word": false,
73
+ "lstrip": false,
74
+ "rstrip": false,
75
+ "normalized": false,
76
+ "special": true
77
+ }
78
+ ],
79
+ "normalizer": null,
80
+ "pre_tokenizer": null,
81
+ "post_processor": null,
82
+ "decoder": null,
83
+ "model": {
84
+ "type": "BPE",
85
+ "dropout": null,
86
+ "unk_token": "<unk>",
87
+ "continuing_subword_prefix": null,
88
+ "end_of_word_suffix": null,
89
+ "fuse_unk": false,
90
+ "byte_fallback": false,
91
+ "ignore_merges": false,
92
+ "vocab": {
93
+ "<cls>": 0,
94
+ "<pad>": 1,
95
+ "<eos>": 2,
96
+ "<unk>": 3,
97
+ "L": 4,
98
+ "A": 5,
99
+ "G": 6,
100
+ "V": 7,
101
+ "S": 8,
102
+ "E": 9,
103
+ "R": 10,
104
+ "T": 11,
105
+ "I": 12,
106
+ "D": 13,
107
+ "P": 14,
108
+ "K": 15,
109
+ "Q": 16,
110
+ "N": 17,
111
+ "F": 18,
112
+ "Y": 19,
113
+ "M": 20,
114
+ "H": 21,
115
+ "W": 22,
116
+ "C": 23,
117
+ "X": 24,
118
+ "B": 25,
119
+ "U": 26,
120
+ "Z": 27,
121
+ "O": 28,
122
+ "a": 29,
123
+ "t": 30,
124
+ "c": 31,
125
+ "g": 32,
126
+ "<+>": 33,
127
+ "<->": 34,
128
+ "<mask>": 35,
129
+ "<sep>": 36
130
+ },
131
+ "merges": []
132
+ }
133
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<cls>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<eos>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "33": {
36
+ "content": "<+>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "34": {
44
+ "content": "<->",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "35": {
52
+ "content": "<mask>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "36": {
60
+ "content": "<sep>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ }
67
+ },
68
+ "auto_map": {
69
+ "AutoTokenizer": [
70
+ "glm_tokenizer.gLM2Tokenizer",
71
+ null
72
+ ]
73
+ },
74
+ "clean_up_tokenization_spaces": true,
75
+ "cls_token": "<cls>",
76
+ "eos_token": "<eos>",
77
+ "extra_special_tokens": {},
78
+ "mask_token": "<mask>",
79
+ "model_max_length": 1000000000000000019884624838656,
80
+ "pad_token": "<pad>",
81
+ "sep_token": "<sep>",
82
+ "tokenizer_class": "gLM2Tokenizer",
83
+ "unk_token": "<unk>"
84
+ }