gdn: add layers.py and expose the layers namespace (Qwen3.6 Qwen3_5{,Moe}GatedDeltaNet + Qwen3-Next GatedDeltaNet) for kernelize()

Browse files

Files changed (4) hide show

build/torch210-cxx11-cu130-aarch64-linux/layers.py +6 -3
build/torch211-cxx11-cu130-aarch64-linux/layers.py +6 -3
build/torch212-cxx11-cu130-aarch64-linux/layers.py +6 -3
build/torch212-cxx11-cu132-aarch64-linux/layers.py +6 -3

build/torch210-cxx11-cu130-aarch64-linux/layers.py CHANGED Viewed

@@ -24,7 +24,10 @@ input-projection layout, so the shared core lives in the module-level
 ``kernels`` forbids extra class members and a custom ``__init__`` on a layer
 (``_validate_layer``), which is why all helpers are module-level functions, not
-methods.
 On a DGX Spark the upstream ``fla`` / ``causal_conv1d`` fast paths have no SM121
 build, so ``transformers`` silently falls back to a slow pure-torch
@@ -174,7 +177,7 @@ class GatedDeltaNet(nn.Module):
     has_backward: bool = False
     can_torch_compile: bool = False
-    def forward(self, hidden_states, cache_params=None, attention_mask=None):
         hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
         projected_states_qkvz = self.in_proj_qkvz(hidden_states)
@@ -200,7 +203,7 @@ class Qwen3_5GatedDeltaNet(nn.Module):
     has_backward: bool = False
     can_torch_compile: bool = False
-    def forward(self, hidden_states, cache_params=None, attention_mask=None):
         hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
         batch_size, seq_len, _ = hidden_states.shape

 ``kernels`` forbids extra class members and a custom ``__init__`` on a layer
 (``_validate_layer``), which is why all helpers are module-level functions, not
+methods. ``_validate_layer`` also requires the layer ``forward`` signature to
+match the host's argument count exactly, so ``forward`` takes the same
+``**kwargs`` (``Unpack[TransformersKwargs]``) the host GDN layers carry in
+transformers >= 5.10; the kernel path ignores those kwargs.
 On a DGX Spark the upstream ``fla`` / ``causal_conv1d`` fast paths have no SM121
 build, so ``transformers`` silently falls back to a slow pure-torch
     has_backward: bool = False
     can_torch_compile: bool = False
+    def forward(self, hidden_states, cache_params=None, attention_mask=None, **kwargs):
         hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
         projected_states_qkvz = self.in_proj_qkvz(hidden_states)
     has_backward: bool = False
     can_torch_compile: bool = False
+    def forward(self, hidden_states, cache_params=None, attention_mask=None, **kwargs):
         hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
         batch_size, seq_len, _ = hidden_states.shape

build/torch211-cxx11-cu130-aarch64-linux/layers.py CHANGED Viewed

@@ -24,7 +24,10 @@ input-projection layout, so the shared core lives in the module-level
 ``kernels`` forbids extra class members and a custom ``__init__`` on a layer
 (``_validate_layer``), which is why all helpers are module-level functions, not
-methods.
 On a DGX Spark the upstream ``fla`` / ``causal_conv1d`` fast paths have no SM121
 build, so ``transformers`` silently falls back to a slow pure-torch
@@ -174,7 +177,7 @@ class GatedDeltaNet(nn.Module):
     has_backward: bool = False
     can_torch_compile: bool = False
-    def forward(self, hidden_states, cache_params=None, attention_mask=None):
         hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
         projected_states_qkvz = self.in_proj_qkvz(hidden_states)
@@ -200,7 +203,7 @@ class Qwen3_5GatedDeltaNet(nn.Module):
     has_backward: bool = False
     can_torch_compile: bool = False
-    def forward(self, hidden_states, cache_params=None, attention_mask=None):
         hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
         batch_size, seq_len, _ = hidden_states.shape

 ``kernels`` forbids extra class members and a custom ``__init__`` on a layer
 (``_validate_layer``), which is why all helpers are module-level functions, not
+methods. ``_validate_layer`` also requires the layer ``forward`` signature to
+match the host's argument count exactly, so ``forward`` takes the same
+``**kwargs`` (``Unpack[TransformersKwargs]``) the host GDN layers carry in
+transformers >= 5.10; the kernel path ignores those kwargs.
 On a DGX Spark the upstream ``fla`` / ``causal_conv1d`` fast paths have no SM121
 build, so ``transformers`` silently falls back to a slow pure-torch
     has_backward: bool = False
     can_torch_compile: bool = False
+    def forward(self, hidden_states, cache_params=None, attention_mask=None, **kwargs):
         hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
         projected_states_qkvz = self.in_proj_qkvz(hidden_states)
     has_backward: bool = False
     can_torch_compile: bool = False
+    def forward(self, hidden_states, cache_params=None, attention_mask=None, **kwargs):
         hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
         batch_size, seq_len, _ = hidden_states.shape

build/torch212-cxx11-cu130-aarch64-linux/layers.py CHANGED Viewed

@@ -24,7 +24,10 @@ input-projection layout, so the shared core lives in the module-level
 ``kernels`` forbids extra class members and a custom ``__init__`` on a layer
 (``_validate_layer``), which is why all helpers are module-level functions, not
-methods.
 On a DGX Spark the upstream ``fla`` / ``causal_conv1d`` fast paths have no SM121
 build, so ``transformers`` silently falls back to a slow pure-torch
@@ -174,7 +177,7 @@ class GatedDeltaNet(nn.Module):
     has_backward: bool = False
     can_torch_compile: bool = False
-    def forward(self, hidden_states, cache_params=None, attention_mask=None):
         hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
         projected_states_qkvz = self.in_proj_qkvz(hidden_states)
@@ -200,7 +203,7 @@ class Qwen3_5GatedDeltaNet(nn.Module):
     has_backward: bool = False
     can_torch_compile: bool = False
-    def forward(self, hidden_states, cache_params=None, attention_mask=None):
         hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
         batch_size, seq_len, _ = hidden_states.shape

 ``kernels`` forbids extra class members and a custom ``__init__`` on a layer
 (``_validate_layer``), which is why all helpers are module-level functions, not
+methods. ``_validate_layer`` also requires the layer ``forward`` signature to
+match the host's argument count exactly, so ``forward`` takes the same
+``**kwargs`` (``Unpack[TransformersKwargs]``) the host GDN layers carry in
+transformers >= 5.10; the kernel path ignores those kwargs.
 On a DGX Spark the upstream ``fla`` / ``causal_conv1d`` fast paths have no SM121
 build, so ``transformers`` silently falls back to a slow pure-torch
     has_backward: bool = False
     can_torch_compile: bool = False
+    def forward(self, hidden_states, cache_params=None, attention_mask=None, **kwargs):
         hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
         projected_states_qkvz = self.in_proj_qkvz(hidden_states)
     has_backward: bool = False
     can_torch_compile: bool = False
+    def forward(self, hidden_states, cache_params=None, attention_mask=None, **kwargs):
         hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
         batch_size, seq_len, _ = hidden_states.shape

build/torch212-cxx11-cu132-aarch64-linux/layers.py CHANGED Viewed

@@ -24,7 +24,10 @@ input-projection layout, so the shared core lives in the module-level
 ``kernels`` forbids extra class members and a custom ``__init__`` on a layer
 (``_validate_layer``), which is why all helpers are module-level functions, not
-methods.
 On a DGX Spark the upstream ``fla`` / ``causal_conv1d`` fast paths have no SM121
 build, so ``transformers`` silently falls back to a slow pure-torch
@@ -174,7 +177,7 @@ class GatedDeltaNet(nn.Module):
     has_backward: bool = False
     can_torch_compile: bool = False
-    def forward(self, hidden_states, cache_params=None, attention_mask=None):
         hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
         projected_states_qkvz = self.in_proj_qkvz(hidden_states)
@@ -200,7 +203,7 @@ class Qwen3_5GatedDeltaNet(nn.Module):
     has_backward: bool = False
     can_torch_compile: bool = False
-    def forward(self, hidden_states, cache_params=None, attention_mask=None):
         hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
         batch_size, seq_len, _ = hidden_states.shape

 ``kernels`` forbids extra class members and a custom ``__init__`` on a layer
 (``_validate_layer``), which is why all helpers are module-level functions, not
+methods. ``_validate_layer`` also requires the layer ``forward`` signature to
+match the host's argument count exactly, so ``forward`` takes the same
+``**kwargs`` (``Unpack[TransformersKwargs]``) the host GDN layers carry in
+transformers >= 5.10; the kernel path ignores those kwargs.
 On a DGX Spark the upstream ``fla`` / ``causal_conv1d`` fast paths have no SM121
 build, so ``transformers`` silently falls back to a slow pure-torch
     has_backward: bool = False
     can_torch_compile: bool = False
+    def forward(self, hidden_states, cache_params=None, attention_mask=None, **kwargs):
         hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
         projected_states_qkvz = self.in_proj_qkvz(hidden_states)
     has_backward: bool = False
     can_torch_compile: bool = False
+    def forward(self, hidden_states, cache_params=None, attention_mask=None, **kwargs):
         hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
         batch_size, seq_len, _ = hidden_states.shape