Upload folder using huggingface_hub
- __pycache__/configuration_jet_nemotron.cpython-310.pyc +0 -0
- __pycache__/dconv_fwd_cache.cpython-310.pyc +0 -0
- __pycache__/dconv_fwdbwd.cpython-310.pyc +0 -0
- __pycache__/dconv_step.cpython-310.pyc +0 -0
- __pycache__/dynamic_conv.cpython-310.pyc +0 -0
- __pycache__/jet_block.cpython-310.pyc +0 -0
- __pycache__/kv_cache.cpython-310.pyc +0 -0
- jet_block.py +5 -8
__pycache__/configuration_jet_nemotron.cpython-310.pyc ADDED
Binary file (8.29 kB)

__pycache__/dconv_fwd_cache.cpython-310.pyc ADDED
Binary file (6.95 kB)

__pycache__/dconv_fwdbwd.cpython-310.pyc ADDED
Binary file (6.49 kB)

__pycache__/dconv_step.cpython-310.pyc ADDED
Binary file (4.21 kB)

__pycache__/dynamic_conv.cpython-310.pyc ADDED
Binary file (7.45 kB)

__pycache__/jet_block.cpython-310.pyc ADDED
Binary file (7.07 kB)

__pycache__/kv_cache.cpython-310.pyc ADDED
Binary file (6.23 kB)
jet_block.py CHANGED
@@ -46,7 +46,7 @@ class JetBlockConfig():
     head_dim: int = 256
     norm_eps: float = 1e-5
     conv_size: int = 4
-    dconv_generator_reduction: int =
+    dconv_generator_reduction: int = 8
     dconv_implementation: str = 'triton'
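The first hunk pins `dconv_generator_reduction` to a default of 8. In dynamic-convolution blocks of this kind, a reduction factor of this sort typically sets the width of the bottleneck in the kernel-generator MLP, trading generator capacity against parameter count. A minimal sketch of that pattern follows; the class name, layer layout, activation, and softmax normalization below are assumptions for illustration, not the actual contents of this repo's `dynamic_conv.py`:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class DynamicConvKernelGenerator(nn.Module):
    """Hypothetical bottleneck generator: hidden_size -> hidden_size // reduction -> per-head conv kernels."""

    def __init__(self, hidden_size: int, num_heads: int, conv_size: int = 4, reduction: int = 8):
        super().__init__()
        self.num_heads = num_heads
        self.conv_size = conv_size
        # The reduction factor shrinks the intermediate width (e.g. 2048 -> 256 with reduction=8).
        self.down = nn.Linear(hidden_size, hidden_size // reduction)
        self.up = nn.Linear(hidden_size // reduction, num_heads * conv_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, hidden_size) -> kernels: (batch, seq_len, num_heads, conv_size)
        kernels = self.up(F.silu(self.down(x)))
        kernels = kernels.view(*x.shape[:-1], self.num_heads, self.conv_size)
        # Dynamic-conv designs often softmax-normalize the kernel taps.
        return kernels.softmax(dim=-1)

gen = DynamicConvKernelGenerator(hidden_size=2048, num_heads=16, conv_size=4, reduction=8)
kernels = gen(torch.randn(2, 10, 2048))  # -> (2, 10, 16, 4)
```

Under these assumptions, a larger reduction shrinks the generator's parameter count roughly linearly at the cost of a narrower bottleneck.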
@@ -180,24 +180,21 @@ class JetBlock(nn.Module):
         if attention_mask is not None and q_len > 1:
             indices, cu_seqlens, _ = get_unpad_data(attention_mask[:, -q_len:])
 
-        conv_state = None
-
         conv_mask = attention_mask[:, -hidden_states.shape[1]:] if attention_mask is not None else None
 
         q = F.silu(self.q_proj(hidden_states))
         k = F.silu(self.k_proj(hidden_states))
 
-
+        conv_state = None
         if last_state is not None:
-
-        v, conv_state_v = self.dynamic_conv1d(
+            conv_state = last_state['conv_state']
+        v, conv_state = self.dynamic_conv1d(
             x=self.v_proj(hidden_states),
             generator_input=hidden_states,
             mask=conv_mask,
-            cache=
+            cache=conv_state,
             output_final_state=use_cache,
         )
-        conv_state = conv_state + (conv_state_v,) if conv_state is not None else (conv_state_v,)
 
         if attention_mask is not None and q_len > 1:
             q = index_first_axis(rearrange(q, "b s ... -> (b s) ..."), indices).unsqueeze(0)
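The second hunk rewires how the convolution state is threaded through the cache during incremental decoding: `conv_state` is now read from `last_state['conv_state']`, passed into `dynamic_conv1d` via `cache=`, and the updated state comes back through the same variable instead of being accumulated into a tuple. A minimal sketch of that cache contract, using a plain depthwise causal convolution as a toy stand-in (the real `dynamic_conv1d` additionally generates its kernels from `generator_input`; all names here are illustrative):

```python
import torch
import torch.nn.functional as F

def causal_conv1d_step(x, weight, cache=None, output_final_state=True):
    """Toy causal depthwise conv with a rolling input cache.

    x:      (batch, seq_len, channels) new inputs (seq_len == 1 at decode time)
    weight: (channels, conv_size) depthwise kernel
    cache:  (batch, conv_size - 1, channels) trailing inputs from earlier steps, or None
    """
    batch, _, channels = x.shape
    conv_size = weight.shape[-1]
    if cache is None:
        # Prefill: no history yet, so left-pad with zeros.
        cache = x.new_zeros(batch, conv_size - 1, channels)
    ctx = torch.cat([cache, x], dim=1)  # prepend cached history
    y = F.conv1d(ctx.transpose(1, 2), weight.unsqueeze(1),
                 groups=channels).transpose(1, 2)  # depthwise causal conv
    # Hand back the last conv_size - 1 inputs as the state for the next call.
    new_cache = ctx[:, -(conv_size - 1):] if output_final_state else None
    return y, new_cache

# Decode loop mirrors the patched call site: thread the state through `cache`.
weight = torch.randn(64, 4)
state = None
for _ in range(3):
    tok = torch.randn(2, 1, 64)  # one new token per step
    out, state = causal_conv1d_step(tok, weight, cache=state)
```

Under this contract the prefill call starts from an empty cache and `output_final_state` (driven by `use_cache` in the patched code) decides whether a state is handed back, while each one-token decode call consumes and re-emits the rolling window, which is why the fix both seeds `conv_state` from `last_state` and overwrites it with the convolution's returned state.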