Commit ebdbb00
Parent(s): 52551dd

Corrected x repeat before conv

Files changed:
- configuration_hymba.py  +3 -0
- modeling_hymba.py  +58 -24
configuration_hymba.py
CHANGED

@@ -47,6 +47,7 @@ class HymbaConfig(PretrainedConfig):
        global_attn_idx=None,
        num_mamba=1,
        pure_attn=False,
+        repeat_x_before_conv=True,
        attn_implementation_new='sdpa',
        rope_type=None,
        attn_factor=0.5,

@@ -113,6 +114,8 @@ class HymbaConfig(PretrainedConfig):

        self.pure_attn = pure_attn

+        self.repeat_x_before_conv = repeat_x_before_conv
+
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
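As a quick usage note (not part of the commit), the new field is an ordinary HymbaConfig constructor argument and can be toggled like any other. A minimal sketch, assuming configuration_hymba.py is importable and the remaining arguments keep their defaults:

# Minimal usage sketch (assumes configuration_hymba.py is on the Python path).
from configuration_hymba import HymbaConfig

config = HymbaConfig(repeat_x_before_conv=False)  # flag added by this commit; defaults to True
print(config.repeat_x_before_conv)                # -> False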
modeling_hymba.py
CHANGED

@@ -420,8 +420,23 @@ class HybridMambaAttentionDynamicCache(DynamicCache):
    def __init__(self, config, batch_size, dtype=torch.float16, device=None, layer_type=None):
        self.dtype = dtype
        # self.layers_block_type = config.layers_block_type
+
+        self.pure_attn = config.pure_attn
+
+        if self.pure_attn:
+            self.attn_hidden_size = config.hidden_size
+            self.intermediate_size = int(config.mamba_expand * config.hidden_size)
+        else:
+            self.attn_hidden_size = int(config.hidden_size * config.attn_factor)
+            config.attn_hidden_size = self.attn_hidden_size
+            self.intermediate_size = int(config.mamba_expand * config.hidden_size * (1-config.attn_factor))
+
+        self.xB_size = int(config.num_key_value_heads/config.num_attention_heads * self.intermediate_size)
+
+        # self.num_xb_head = self.xB_size // self.ssm_state_size
+
+
        self.has_previous_state = False # only used by mamba
-        intermediate_size = config.mamba_expand * config.hidden_size
        ssm_state_size = config.mamba_d_state
        conv_kernel_size = config.mamba_d_conv
        self.conv_states = []

@@ -439,12 +454,12 @@ class HybridMambaAttentionDynamicCache(DynamicCache):
                if hasattr(config, 'conv_dim'):
                    conv_dim = config.conv_dim[str(i)]
                else:
-                    conv_dim =
+                    conv_dim = self.xB_size
                self.conv_states += [
                    torch.zeros(batch_size, conv_dim, conv_kernel_size, device=device, dtype=dtype)
                ]
                self.ssm_states += [
-                    torch.zeros(batch_size, intermediate_size, ssm_state_size, device=device, dtype=dtype)
+                    torch.zeros(batch_size, self.intermediate_size, ssm_state_size, device=device, dtype=dtype)
                ]
            else:
                self.conv_states += [torch.tensor([[]] * batch_size, device=device)]

@@ -1592,19 +1607,30 @@ class HymbaBlock(nn.Module):

        if not self.pure_attn:

+            self.repeat_x_before_conv = config.repeat_x_before_conv
            num_ssm_param = 1

            if not hasattr(config, 'conv_dim'):
                config.conv_dim = {str(i):0 for i in range(config.num_hidden_layers)}

-            self.
-
-
-
-
-
-
-
+            if self.repeat_x_before_conv:
+                self.conv1d = nn.Conv1d(
+                    in_channels=self.intermediate_size,
+                    out_channels=self.intermediate_size,
+                    bias=self.use_conv_bias,
+                    kernel_size=self.conv_kernel_size,
+                    groups=self.intermediate_size,
+                    padding=self.conv_kernel_size - 1
+                )
+            else:
+                self.conv1d = nn.Conv1d(
+                    in_channels=self.xB_size,
+                    out_channels=self.xB_size,
+                    bias=self.use_conv_bias,
+                    kernel_size=self.conv_kernel_size,
+                    groups=self.xB_size,
+                    padding=self.conv_kernel_size - 1
+                )

            config.conv_dim[str(self.layer_idx)] = self.xB_size


@@ -1724,7 +1750,7 @@ class HymbaBlock(nn.Module):

        index = 0
        # ssm_parameters = self.x_proj[index](hidden_states.transpose(1, 2))
-        B, C, x,
+        B, C, x, time_step = torch.split(
            hidden_states.transpose(1,2), [self.xB_size, self.intermediate_size, self.xB_size, self.time_step_rank], dim=-1
        )


@@ -1734,14 +1760,18 @@
        B = rearrange(B, "b n_group l dstate -> b n_group dstate l").contiguous()
        C = rearrange(C, "b l (n_group dstate) -> b n_group dstate l", dstate=self.ssm_state_size).contiguous()

-        x = rearrange(x, "b l d -> b d l")
-
-
-
+        x = rearrange(x, "b l d -> b d l").contiguous()
+
+        if self.repeat_x_before_conv:
+            # b d l
+            x = rearrange(x, "b (n_group dstate) l -> b n_group l dstate", dstate=self.ssm_state_size)
+            x = repeat_kv(x, self.repeat_group)
+            x = rearrange(x, "b n_group l dstate -> b (n_group dstate) l")

        #Run convolution
        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2))

+
        if use_precomputed_states:
            x = causal_conv1d_update(
                x.squeeze(-1),


@@ -1754,18 +1784,22 @@

            cache_params.mamba_past_length[self.layer_idx] += seq_len
        else:
-            if cache_params is not None:
-
-
-
-
-
-                cache_params.mamba_past_length[self.layer_idx] += seq_len
+            # if cache_params is not None:
+            #     conv_states = nn.functional.pad(
+            #         x, (self.conv_kernel_size - x.shape[-1], 0)
+            #     )
+            #     cache_params.conv_states[self.layer_idx].copy_(conv_states)
+            #     cache_params.mamba_past_length[self.layer_idx] += seq_len

            x = causal_conv1d_fn(
                x, conv_weights, self.conv1d.bias, activation=self.activation
            )

+            if not self.repeat_x_before_conv:
+                x = rearrange(x, "b (n_group dstate) l -> b n_group l dstate", dstate=self.ssm_state_size)
+                x = repeat_kv(x, self.repeat_group)
+                x = rearrange(x, "b n_group l dstate -> b (n_group dstate) l")
+
            ## Handle padding for Mamba: Set padding tokens to 0
            if seq_len > 1 and attention_mask is not None and (attention_mask == 0).any():
                x = x * attention_mask.unsqueeze(1).to(x)


@@ -1792,7 +1826,7 @@
        if use_precomputed_states:
            scan_outputs = selective_state_update(
                cache_params.ssm_states[self.layer_idx],
-                x,
+                x.squeeze(),
                discrete_time_step,
                A,
                B,