File size: 5,445 Bytes

e2bfccc

"""Gamma SSM Block with residual connections and normalization."""

import torch
import torch.nn as nn
from typing import Optional, Tuple

from .ssm_gamma import SSMGamma
from .normalization import LayerNorm


class GammaSingleBlock(nn.Module):
    """Single Gamma SSM block with residual connection and layer normalization.

    With ``prenorm=True`` the block computes::

        y = residual_scale * x + Dropout(SSM(LayerNorm(x)))

    With ``prenorm=False`` (postnorm) it computes::

        y = LayerNorm(residual_scale * x + Dropout(SSM(x)))

    ``forward`` (whole sequence) and ``step`` (single timestep) go through the
    same residual/normalization helper, so both modes are handled identically
    on the two paths. The normalization is applied exactly once per call.

    Args:
        d_model: Model dimension (passed to the SSM as ``state_dim``).
        hidden_dim: Hidden dimension for the SSM state.
        delta_t: Time discretization step (default: 0.1).
        kernel_length: Convolution kernel length for future use (default: 4).
        A_type: Type of A matrix initialization (default: "tridiagonal").
        prenorm: Use prenorm (LayerNorm -> Block) vs postnorm
            (Block -> LayerNorm) (default: True).
        residual_scale: Scaling factor applied to the residual input
            (default: 1.0).
        dropout: Dropout rate applied to the SSM output (default: 0.0).
            No dropout module is created when the rate is not positive.

    Shape:
        - Input: (batch, seq_len, d_model)
        - Output: (batch, seq_len, d_model)
    """

    def __init__(
        self,
        d_model: int,
        hidden_dim: int,
        delta_t: float = 0.1,
        kernel_length: int = 4,
        A_type: str = "tridiagonal",
        prenorm: bool = True,
        residual_scale: float = 1.0,
        dropout: float = 0.0,
    ):
        super().__init__()
        self.d_model = d_model
        self.prenorm = prenorm
        self.dropout_p = dropout
        self.residual_scale = residual_scale

        # Normalization (applied before the SSM in prenorm mode, after the
        # residual sum in postnorm mode).
        self.norm = LayerNorm(d_model)

        # SSM block
        self.ssm = SSMGamma(
            state_dim=d_model,
            hidden_dim=hidden_dim,
            delta_t=delta_t,
            kernel_length=kernel_length,
            A_type=A_type,
        )

        # Dropout is optional; ``None`` means "skip entirely".
        self.dropout = nn.Dropout(dropout) if dropout > 0 else None

    def _merge_residual(
        self, residual: torch.Tensor, ssm_out: torch.Tensor
    ) -> torch.Tensor:
        """Combine the SSM output with the (scaled) residual input.

        Applies dropout to ``ssm_out`` when configured, adds
        ``residual * residual_scale`` and, in postnorm mode only, normalizes
        the sum. Shared by ``forward`` and ``step`` so both stay in sync.
        """
        if self.dropout is not None:
            ssm_out = self.dropout(ssm_out)

        merged = residual * self.residual_scale + ssm_out

        # Postnorm: a single LayerNorm over the residual sum.
        if not self.prenorm:
            merged = self.norm(merged)
        return merged

    def forward(
        self,
        x: torch.Tensor,
        state: Optional[torch.Tensor] = None,
        mask: Optional[torch.Tensor] = None,
        return_state: bool = True,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Forward pass through block.

        Args:
            x: Input tensor (batch, seq_len, d_model)
            state: Optional initial hidden state (batch, hidden_dim)
            mask: Optional mask (batch, seq_len) for valid positions
            return_state: When False, the second return value is ``None``
                instead of the SSM's final state.

        Returns:
            output: (batch, seq_len, d_model)
            final_state: Final hidden state from SSM (batch, hidden_dim),
                or ``None`` if ``return_state`` is False.
        """
        # Prenorm feeds the normalized input to the SSM; postnorm feeds x raw.
        ssm_in = self.norm(x) if self.prenorm else x
        ssm_out, final_state = self.ssm(ssm_in, mask=mask, state=state)

        output = self._merge_residual(x, ssm_out)
        return output, (final_state if return_state else None)

    def step(
        self, u: torch.Tensor, h: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Single step inference through block (RNN style).

        Uses the same normalization/residual path as ``forward``.

        Args:
            u: Input tensor (batch, d_model) - single timestep
            h: Hidden state (batch, hidden_dim)

        Returns:
            output: (batch, d_model) - block output
            h_new: (batch, hidden_dim) - new hidden state
        """
        ssm_in = self.norm(u) if self.prenorm else u
        ssm_out, h_new = self.ssm.step(ssm_in, h)
        return self._merge_residual(u, ssm_out), h_new

    def allocate_inference_cache(
        self,
        batch_size: int,
        seq_len: int,
        device: torch.device,
        dtype: torch.dtype,
    ):
        """Allocate cache for efficient inference.

        Delegates to the wrapped SSM's ``allocate_inference_cache`` and
        returns whatever it returns.
        """
        return self.ssm.allocate_inference_cache(batch_size, seq_len, device, dtype)

    def allocate_deployment_cache(
        self,
        batch_size: int,
        seq_len: int,
        device: torch.device,
        dtype: torch.dtype,
    ):
        """Alias of ``allocate_inference_cache`` with the same arguments."""
        return self.allocate_inference_cache(batch_size, seq_len, device, dtype)

    def allocate_balanced_deployment_cache(
        self,
        batch_size: int,
        seq_len: int,
        device: torch.device,
        dtype: torch.dtype,
    ):
        """Alias of ``allocate_inference_cache`` with the same arguments."""
        return self.allocate_inference_cache(batch_size, seq_len, device, dtype)