Kernels
gueraf committed on
Commit
ed7cbde
·
verified ·
1 Parent(s): 81cdecb

graft torch211-cu130 builds (legacy flash-attn signature, sm80/90/100/120)

Browse files
.gitattributes CHANGED
@@ -107,3 +107,5 @@ build/torch212-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=
107
  build/torch212-cxx11-cu132-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
108
  build/torch29-cxx11-cu128-aarch64-linux/dropout_layer_norm.cpython-312-aarch64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
109
  build/torch29-cxx11-cu128-x86_64-linux/dropout_layer_norm.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
 
 
 
107
  build/torch212-cxx11-cu132-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
108
  build/torch29-cxx11-cu128-aarch64-linux/dropout_layer_norm.cpython-312-aarch64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
109
  build/torch29-cxx11-cu128-x86_64-linux/dropout_layer_norm.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
110
+ build/torch211-cxx11-cu130-aarch64-linux/dropout_layer_norm.cpython-312-aarch64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
111
+ build/torch211-cxx11-cu130-x86_64-linux/dropout_layer_norm.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
build/torch211-cxx11-cu130-aarch64-linux/__init__.py CHANGED
@@ -1,26 +1,38 @@
1
- import torch
2
- import torch.nn as nn
3
 
4
- from ._ops import ops
 
 
 
5
 
6
- from . import layers
 
 
 
7
 
8
- def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
9
- return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
10
 
11
- def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
12
- return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
 
13
 
14
- def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
15
- return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
 
 
 
 
16
 
17
- def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
18
- return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
 
 
19
 
20
  __all__ = [
21
- "layers",
22
  "dropout_add_ln_fwd",
23
  "dropout_add_ln_bwd",
24
  "dropout_add_ln_parallel_residual_fwd",
25
  "dropout_add_ln_parallel_residual_bwd",
26
- ]
 
1
+ """torch 2.11 / cu13.0 build variant grafted from the odysseyml flash-attention fork.
 
2
 
3
+ Wraps the pre-built ``dropout_layer_norm`` extension from the
4
+ ``odyssey-fused-kernels`` wheel (tag ``odyssey-v2.8.3-fused-1``, built for
5
+ sm_80/90/100/120) because the upstream kernels-community/layer-norm build for
6
+ this variant does not expose flash-attention's original op signatures.
7
 
8
+ Unlike the kernels-community builds (which drop ``residual`` from the fused
9
+ op signatures), this build keeps flash-attention's original signature with
10
+ ``residual`` as the second argument. Consumers can check ``SUPPORTS_RESIDUAL``.
11
+ """
12
 
13
+ import importlib.util
14
+ from pathlib import Path
15
 
16
+ # True: kernels with flash-attention's original signature: fwd takes
17
+ # (x0, residual, gamma, beta, ...) and bwd is supported.
18
+ SUPPORTS_RESIDUAL = True
19
 
20
+ # The extension's PyInit_* symbol is derived from the module name passed to
21
+ # the loader, so the spec name must exactly equal the .so module name.
22
+ _so_path = next(Path(__file__).parent.glob("dropout_layer_norm*.so"))
23
+ _spec = importlib.util.spec_from_file_location("dropout_layer_norm", _so_path)
24
+ _ext = importlib.util.module_from_spec(_spec)
25
+ _spec.loader.exec_module(_ext)
26
 
27
+ dropout_add_ln_fwd = _ext.dropout_add_ln_fwd
28
+ dropout_add_ln_bwd = _ext.dropout_add_ln_bwd
29
+ dropout_add_ln_parallel_residual_fwd = _ext.dropout_add_ln_parallel_residual_fwd
30
+ dropout_add_ln_parallel_residual_bwd = _ext.dropout_add_ln_parallel_residual_bwd
31
 
32
  __all__ = [
33
+ "SUPPORTS_RESIDUAL",
34
  "dropout_add_ln_fwd",
35
  "dropout_add_ln_bwd",
36
  "dropout_add_ln_parallel_residual_fwd",
37
  "dropout_add_ln_parallel_residual_bwd",
38
+ ]
build/torch211-cxx11-cu130-aarch64-linux/dropout_layer_norm.cpython-312-aarch64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d074846723309a009d9624dda9c55847a05299e0c9492d6e15add2c6c3eb1bb1
3
+ size 1017902136
build/torch211-cxx11-cu130-aarch64-linux/metadata.json CHANGED
@@ -1,17 +1,14 @@
1
  {
2
- "name": "layer-norm",
3
- "id": "_layer_norm_cuda_73ccd0c",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
7
  "backend": {
8
  "type": "cuda",
9
  "archs": [
10
- "10.0",
11
- "12.0",
12
  "8.0",
13
- "8.9",
14
- "9.0"
 
15
  ]
16
  }
17
  }
 
1
  {
 
 
2
  "version": 1,
3
  "license": "BSD-3-Clause",
4
  "python-depends": [],
5
  "backend": {
6
  "type": "cuda",
7
  "archs": [
 
 
8
  "8.0",
9
+ "9.0",
10
+ "10.0",
11
+ "12.0"
12
  ]
13
  }
14
  }
build/torch211-cxx11-cu130-x86_64-linux/__init__.py CHANGED
@@ -1,26 +1,38 @@
1
- import torch
2
- import torch.nn as nn
3
 
4
- from ._ops import ops
 
 
 
5
 
6
- from . import layers
 
 
 
7
 
8
- def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
9
- return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
10
 
11
- def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
12
- return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
 
13
 
14
- def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
15
- return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
 
 
 
 
16
 
17
- def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
18
- return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
 
 
19
 
20
  __all__ = [
21
- "layers",
22
  "dropout_add_ln_fwd",
23
  "dropout_add_ln_bwd",
24
  "dropout_add_ln_parallel_residual_fwd",
25
  "dropout_add_ln_parallel_residual_bwd",
26
- ]
 
1
+ """torch 2.11 / cu13.0 build variant grafted from the odysseyml flash-attention fork.
 
2
 
3
+ Wraps the pre-built ``dropout_layer_norm`` extension from the
4
+ ``odyssey-fused-kernels`` wheel (tag ``odyssey-v2.8.3-fused-1``, built for
5
+ sm_80/90/100/120) because the upstream kernels-community/layer-norm build for
6
+ this variant does not expose flash-attention's original op signatures.
7
 
8
+ Unlike the kernels-community builds (which drop ``residual`` from the fused
9
+ op signatures), this build keeps flash-attention's original signature with
10
+ ``residual`` as the second argument. Consumers can check ``SUPPORTS_RESIDUAL``.
11
+ """
12
 
13
+ import importlib.util
14
+ from pathlib import Path
15
 
16
+ # True: kernels with flash-attention's original signature: fwd takes
17
+ # (x0, residual, gamma, beta, ...) and bwd is supported.
18
+ SUPPORTS_RESIDUAL = True
19
 
20
+ # The extension's PyInit_* symbol is derived from the module name passed to
21
+ # the loader, so the spec name must exactly equal the .so module name.
22
+ _so_path = next(Path(__file__).parent.glob("dropout_layer_norm*.so"))
23
+ _spec = importlib.util.spec_from_file_location("dropout_layer_norm", _so_path)
24
+ _ext = importlib.util.module_from_spec(_spec)
25
+ _spec.loader.exec_module(_ext)
26
 
27
+ dropout_add_ln_fwd = _ext.dropout_add_ln_fwd
28
+ dropout_add_ln_bwd = _ext.dropout_add_ln_bwd
29
+ dropout_add_ln_parallel_residual_fwd = _ext.dropout_add_ln_parallel_residual_fwd
30
+ dropout_add_ln_parallel_residual_bwd = _ext.dropout_add_ln_parallel_residual_bwd
31
 
32
  __all__ = [
33
+ "SUPPORTS_RESIDUAL",
34
  "dropout_add_ln_fwd",
35
  "dropout_add_ln_bwd",
36
  "dropout_add_ln_parallel_residual_fwd",
37
  "dropout_add_ln_parallel_residual_bwd",
38
+ ]
build/torch211-cxx11-cu130-x86_64-linux/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (1.69 kB). View file
 
build/torch211-cxx11-cu130-x86_64-linux/dropout_layer_norm.cpython-312-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe763019c15a09480c5f154293f80e3d2db79f0039ac041a8d58ccf75e80eb60
3
+ size 1021207608
build/torch211-cxx11-cu130-x86_64-linux/layer_norm/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (1.61 kB). View file
 
build/torch211-cxx11-cu130-x86_64-linux/metadata.json CHANGED
@@ -1,17 +1,14 @@
1
  {
2
- "name": "layer-norm",
3
- "id": "_layer_norm_cuda_73ccd0c",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
7
  "backend": {
8
  "type": "cuda",
9
  "archs": [
10
- "10.0",
11
- "12.0",
12
  "8.0",
13
- "8.9",
14
- "9.0"
 
15
  ]
16
  }
17
  }
 
1
  {
 
 
2
  "version": 1,
3
  "license": "BSD-3-Clause",
4
  "python-depends": [],
5
  "backend": {
6
  "type": "cuda",
7
  "archs": [
 
 
8
  "8.0",
9
+ "9.0",
10
+ "10.0",
11
+ "12.0"
12
  ]
13
  }
14
  }