graft torch211-cu130 builds (legacy flash-attn signature, sm80/90/100/120)

Browse files

Files changed (9) hide show

.gitattributes +2 -0
build/torch211-cxx11-cu130-aarch64-linux/__init__.py +26 -14
build/torch211-cxx11-cu130-aarch64-linux/dropout_layer_norm.cpython-312-aarch64-linux-gnu.so +3 -0
build/torch211-cxx11-cu130-aarch64-linux/metadata.json +3 -6
build/torch211-cxx11-cu130-x86_64-linux/__init__.py +26 -14
build/torch211-cxx11-cu130-x86_64-linux/__pycache__/__init__.cpython-312.pyc +0 -0
build/torch211-cxx11-cu130-x86_64-linux/dropout_layer_norm.cpython-312-x86_64-linux-gnu.so +3 -0
build/torch211-cxx11-cu130-x86_64-linux/layer_norm/__pycache__/__init__.cpython-312.pyc +0 -0
build/torch211-cxx11-cu130-x86_64-linux/metadata.json +3 -6

.gitattributes CHANGED Viewed

@@ -107,3 +107,5 @@ build/torch212-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=
 build/torch212-cxx11-cu132-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
 build/torch29-cxx11-cu128-aarch64-linux/dropout_layer_norm.cpython-312-aarch64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
 build/torch29-cxx11-cu128-x86_64-linux/dropout_layer_norm.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text

 build/torch212-cxx11-cu132-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
 build/torch29-cxx11-cu128-aarch64-linux/dropout_layer_norm.cpython-312-aarch64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
 build/torch29-cxx11-cu128-x86_64-linux/dropout_layer_norm.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-aarch64-linux/dropout_layer_norm.cpython-312-aarch64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-x86_64-linux/dropout_layer_norm.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text

build/torch211-cxx11-cu130-aarch64-linux/__init__.py CHANGED Viewed

@@ -1,26 +1,38 @@
-import torch
-import torch.nn as nn
-from ._ops import ops
-from . import layers
-def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
-    return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
-def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
-    return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
-def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
-    return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
-def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
-    return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
 __all__ = [
-    "layers",
     "dropout_add_ln_fwd",
     "dropout_add_ln_bwd",
     "dropout_add_ln_parallel_residual_fwd",
     "dropout_add_ln_parallel_residual_bwd",
-]

+"""torch 2.11 / cu13.0 build variant grafted from the odysseyml flash-attention fork.
+Wraps the pre-built ``dropout_layer_norm`` extension from the
+``odyssey-fused-kernels`` wheel (tag ``odyssey-v2.8.3-fused-1``, built for
+sm_80/90/100/120) because the upstream kernels-community/layer-norm build for
+torch 2.11 / cu13.0 does not keep flash-attention's original op signature.
+Unlike the kernels-community builds (which drop ``residual`` from the fused
+op signatures), this build keeps flash-attention's original signature with
+``residual`` as the second argument. Consumers can check ``SUPPORTS_RESIDUAL``.
+"""
+import importlib.util
+from pathlib import Path
+# True when the kernels use flash-attention's original signature: fwd takes
+# (x0, residual, gamma, beta, ...) and bwd is supported.
+SUPPORTS_RESIDUAL = True
+# The extension's PyInit_* symbol is derived from the module name passed to
+# the loader, so the spec name must exactly equal the .so module name.
+_so_path = next(Path(__file__).parent.glob("dropout_layer_norm*.so"))
+_spec = importlib.util.spec_from_file_location("dropout_layer_norm", _so_path)
+_ext = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(_ext)
+dropout_add_ln_fwd = _ext.dropout_add_ln_fwd
+dropout_add_ln_bwd = _ext.dropout_add_ln_bwd
+dropout_add_ln_parallel_residual_fwd = _ext.dropout_add_ln_parallel_residual_fwd
+dropout_add_ln_parallel_residual_bwd = _ext.dropout_add_ln_parallel_residual_bwd
 __all__ = [
+    "SUPPORTS_RESIDUAL",
     "dropout_add_ln_fwd",
     "dropout_add_ln_bwd",
     "dropout_add_ln_parallel_residual_fwd",
     "dropout_add_ln_parallel_residual_bwd",
+]

build/torch211-cxx11-cu130-aarch64-linux/dropout_layer_norm.cpython-312-aarch64-linux-gnu.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d074846723309a009d9624dda9c55847a05299e0c9492d6e15add2c6c3eb1bb1
+size 1017902136

build/torch211-cxx11-cu130-aarch64-linux/metadata.json CHANGED Viewed

@@ -1,17 +1,14 @@
 {
-  "name": "layer-norm",
-  "id": "_layer_norm_cuda_73ccd0c",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],
   "backend": {
     "type": "cuda",
     "archs": [
-      "10.0",
-      "12.0",
       "8.0",
-      "8.9",
-      "9.0"
     ]
   }
 }

 {
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],
   "backend": {
     "type": "cuda",
     "archs": [
       "8.0",
+      "9.0",
+      "10.0",
+      "12.0"
     ]
   }
 }

build/torch211-cxx11-cu130-x86_64-linux/__init__.py CHANGED Viewed

@@ -1,26 +1,38 @@
-import torch
-import torch.nn as nn
-from ._ops import ops
-from . import layers
-def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
-    return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
-def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
-    return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
-def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
-    return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
-def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
-    return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
 __all__ = [
-    "layers",
     "dropout_add_ln_fwd",
     "dropout_add_ln_bwd",
     "dropout_add_ln_parallel_residual_fwd",
     "dropout_add_ln_parallel_residual_bwd",
-]

+"""torch 2.11 / cu13.0 build variant grafted from the odysseyml flash-attention fork.
+Wraps the pre-built ``dropout_layer_norm`` extension from the
+``odyssey-fused-kernels`` wheel (tag ``odyssey-v2.8.3-fused-1``, built for
+sm_80/90/100/120) because the upstream kernels-community/layer-norm build for
+torch 2.11 / cu13.0 does not keep flash-attention's original op signature.
+Unlike the kernels-community builds (which drop ``residual`` from the fused
+op signatures), this build keeps flash-attention's original signature with
+``residual`` as the second argument. Consumers can check ``SUPPORTS_RESIDUAL``.
+"""
+import importlib.util
+from pathlib import Path
+# True when the kernels use flash-attention's original signature: fwd takes
+# (x0, residual, gamma, beta, ...) and bwd is supported.
+SUPPORTS_RESIDUAL = True
+# The extension's PyInit_* symbol is derived from the module name passed to
+# the loader, so the spec name must exactly equal the .so module name.
+_so_path = next(Path(__file__).parent.glob("dropout_layer_norm*.so"))
+_spec = importlib.util.spec_from_file_location("dropout_layer_norm", _so_path)
+_ext = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(_ext)
+dropout_add_ln_fwd = _ext.dropout_add_ln_fwd
+dropout_add_ln_bwd = _ext.dropout_add_ln_bwd
+dropout_add_ln_parallel_residual_fwd = _ext.dropout_add_ln_parallel_residual_fwd
+dropout_add_ln_parallel_residual_bwd = _ext.dropout_add_ln_parallel_residual_bwd
 __all__ = [
+    "SUPPORTS_RESIDUAL",
     "dropout_add_ln_fwd",
     "dropout_add_ln_bwd",
     "dropout_add_ln_parallel_residual_fwd",
     "dropout_add_ln_parallel_residual_bwd",
+]

build/torch211-cxx11-cu130-x86_64-linux/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (1.69 kB). View file

build/torch211-cxx11-cu130-x86_64-linux/dropout_layer_norm.cpython-312-x86_64-linux-gnu.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe763019c15a09480c5f154293f80e3d2db79f0039ac041a8d58ccf75e80eb60
+size 1021207608

build/torch211-cxx11-cu130-x86_64-linux/layer_norm/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (1.61 kB). View file

build/torch211-cxx11-cu130-x86_64-linux/metadata.json CHANGED Viewed

@@ -1,17 +1,14 @@
 {
-  "name": "layer-norm",
-  "id": "_layer_norm_cuda_73ccd0c",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],
   "backend": {
     "type": "cuda",
     "archs": [
-      "10.0",
-      "12.0",
       "8.0",
-      "8.9",
-      "9.0"
     ]
   }
 }

 {
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],
   "backend": {
     "type": "cuda",
     "archs": [
       "8.0",
+      "9.0",
+      "10.0",
+      "12.0"
     ]
   }
 }