Upload model
Browse files- modeling_t5mimo.py +64 -74
modeling_t5mimo.py
CHANGED
|
@@ -125,6 +125,69 @@ class T5LayerFF(nn.Module):
|
|
| 125 |
return hidden_states
|
| 126 |
|
| 127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
class T5Attention(nn.Module):
|
| 129 |
def __init__(self, config: T5MIMOConfig, has_relative_attention_bias=False):
|
| 130 |
super().__init__()
|
|
@@ -1265,7 +1328,7 @@ class T5MIMOForConditionalGeneration(T5PreTrainedModel):
|
|
| 1265 |
self.decoder = T5Stack(decoder_config, self.shared)
|
| 1266 |
|
| 1267 |
|
| 1268 |
-
self.conv_block = MultivariateConvBlock(config
|
| 1269 |
self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
|
| 1270 |
|
| 1271 |
# Initialize weights and apply final processing
|
|
@@ -1676,76 +1739,3 @@ class T5MIMOEncoderModel(T5PreTrainedModel):
|
|
| 1676 |
|
| 1677 |
|
| 1678 |
|
| 1679 |
-
|
| 1680 |
-
class MultivariateConvBlock(nn.Module):
    def __init__(self, num_seqs, model_dim, kernel_size=3, num_filters=64, stride=1, padding=1):
        """
        Multivariate convolutional block to capture cross-sequence interactions and temporal patterns.

        Args:
            num_seqs (int): Number of sequences (multivariate time series).
            model_dim (int): Dimension of each feature vector (typically 256).
                NOTE(review): unused in this implementation — kept in the
                signature only; confirm whether it was meant to size a layer.
            kernel_size (int): Size of the convolutional kernel. Default is 3.
            num_filters (int): Number of convolutional filters (output channels). Default is 64.
            stride (int): Stride of the convolutional kernel. Default is 1.
                NOTE(review): applied only to conv2, not conv1.
            padding (int): Padding for the convolutional kernel. Default is 1 (to preserve sequence length).
                NOTE(review): applied only to conv2; conv1 hardcodes padding=1,
                which preserves spatial dims only when kernel_size == 3.
        """
        super(MultivariateConvBlock, self).__init__()

        # First conv: num_seqs acts as the channel axis, so this mixes
        # information across sequences; square kernel over the last two dims.
        self.conv1 = nn.Conv2d(
            in_channels=num_seqs,
            out_channels=num_filters,
            kernel_size=kernel_size,
            stride=1,  # hardcoded: the `stride` argument is NOT applied here
            padding=1  # hardcoded: the `padding` argument is NOT applied here
        )

        # Batch normalization for stabilization and faster convergence.
        self.bn1 = nn.BatchNorm2d(num_filters)

        # Second conv: kernel (kernel_size, 1) slides along dim 2 only.
        # NOTE(review): after the permute in forward(), dim 2 is model_dim,
        # not time — confirm the intended axis ("temporal" may be a misnomer).
        self.conv2 = nn.Conv2d(
            in_channels=num_filters,
            out_channels=num_filters,
            kernel_size=(kernel_size, 1),
            stride=(stride, 1),
            padding=(padding, 0)
        )

        # Batch normalization after second convolution.
        self.bn2 = nn.BatchNorm2d(num_filters)

        # 1x1 convolution projects the filter channels back to num_seqs.
        self.conv3 = nn.Conv2d(
            in_channels=num_filters,
            out_channels=num_seqs,  # back to the original number of sequences (channels)
            kernel_size=(1, 1)
        )

    def forward(self, x):
        """
        Forward pass of the multivariate convolutional block.

        Args:
            x (torch.Tensor): Input tensor of shape [batch_size, num_seqs, seq_len, model_dim].

        Returns:
            torch.Tensor: Output tensor of shape [batch_size, num_seqs, seq_len, model_dim]
            (shape is preserved only with the default kernel_size/stride/padding).
        """
        # [batch_size, num_seqs, seq_len, model_dim] -> [batch_size, num_seqs, model_dim, seq_len]
        x = x.permute(0, 1, 3, 2)

        # First convolution + batch norm + ReLU.
        x = nn.functional.relu(self.bn1(self.conv1(x)))
        # Second convolution + batch norm + ReLU.
        x = nn.functional.relu(self.bn2(self.conv2(x)))

        # Reduce channel dimension back to num_seqs.
        x = self.conv3(x)

        # Permute back to the original layout [batch_size, num_seqs, seq_len, model_dim].
        x = x.permute(0, 1, 3, 2)

        return x
|
|
|
|
| 125 |
return hidden_states
|
| 126 |
|
| 127 |
|
| 128 |
+
|
| 129 |
+
class MultivariateConvBlock(nn.Module):
    """Convolutional block that mixes information across the sequence channel.

    Treats the ``num_seqs`` axis of a multivariate series as 2D-conv channels,
    applies two conv+batchnorm+ReLU stages, then a 1x1 conv to project back to
    ``num_seqs`` channels, preserving the input shape at default settings.

    Args:
        config: model config; only ``config.num_seqs`` and
            ``config.num_filters`` are read here.
        kernel_size (int): Convolution kernel size. Default 3.
        stride (int): Stride for the second convolution (dim 2 only). Default 1.
        padding (int): Padding for the second convolution (dim 2 only). Default 1.
    """

    def __init__(self, config: "T5MIMOConfig", kernel_size=3, stride=1, padding=1):
        super().__init__()
        # First conv: num_seqs acts as the channel axis, so this mixes
        # information across sequences with a square kernel over the last
        # two dims.
        self.conv1 = nn.Conv2d(
            in_channels=config.num_seqs,
            out_channels=config.num_filters,
            kernel_size=kernel_size,
            stride=1,  # keep stride 1 here so spatial dims are preserved
            # "same" padding for odd kernels (was hardcoded to 1, which broke
            # shape preservation for any kernel_size != 3)
            padding=kernel_size // 2,
        )

        # Batch normalization for stabilization and faster convergence.
        self.bn1 = nn.BatchNorm2d(config.num_filters)

        # Second conv: kernel (kernel_size, 1) slides along dim 2 only.
        # NOTE(review): after the permute in forward(), dim 2 is model_dim,
        # not time — confirm the intended axis ("temporal" in the original
        # comments may be a misnomer).
        self.conv2 = nn.Conv2d(
            in_channels=config.num_filters,
            out_channels=config.num_filters,
            kernel_size=(kernel_size, 1),
            stride=(stride, 1),
            padding=(padding, 0),
        )

        # Batch normalization after the second convolution.
        self.bn2 = nn.BatchNorm2d(config.num_filters)

        # 1x1 convolution projects the filter channels back to num_seqs.
        self.conv3 = nn.Conv2d(
            in_channels=config.num_filters,
            out_channels=config.num_seqs,
            kernel_size=(1, 1),
        )

    def forward(self, x):
        """
        Forward pass of the multivariate convolutional block.

        Args:
            x (torch.Tensor): Input tensor of shape
                [batch_size, num_seqs, seq_len, model_dim].

        Returns:
            torch.Tensor: Output tensor of shape
            [batch_size, num_seqs, seq_len, model_dim] (preserved when
            conv2's padding matches its kernel, e.g. the defaults).
        """
        # [batch_size, num_seqs, seq_len, model_dim]
        #   -> [batch_size, num_seqs, model_dim, seq_len]
        x = x.permute(0, 1, 3, 2)

        # Two conv + batchnorm + ReLU stages.
        x = nn.functional.relu(self.bn1(self.conv1(x)))
        x = nn.functional.relu(self.bn2(self.conv2(x)))

        # Reduce channel dimension back to num_seqs.
        x = self.conv3(x)

        # Restore the original layout [batch_size, num_seqs, seq_len, model_dim].
        return x.permute(0, 1, 3, 2)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
|
| 191 |
class T5Attention(nn.Module):
|
| 192 |
def __init__(self, config: T5MIMOConfig, has_relative_attention_bias=False):
|
| 193 |
super().__init__()
|
|
|
|
| 1328 |
self.decoder = T5Stack(decoder_config, self.shared)
|
| 1329 |
|
| 1330 |
|
| 1331 |
+
self.conv_block = MultivariateConvBlock(config)
|
| 1332 |
self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
|
| 1333 |
|
| 1334 |
# Initialize weights and apply final processing
|
|
|
|
| 1739 |
|
| 1740 |
|
| 1741 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|