Kernels:

kernels-community
/

flash-attn2

Trusted publisher

Kernel card Files Files and versions

xet

Community

kernels-bot committed on 10 days ago

Commit

5dde0f3

verified ·

1 Parent(s): 19b59e6

Uploaded using `kernel-builder`.

Browse files

Files changed (36) hide show

build/torch210-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
build/torch210-cxx11-cu126-aarch64-linux/_ops.py +3 -3
build/torch210-cxx11-cu126-aarch64-linux/flash_attn_interface.py +8 -1
build/torch210-cxx11-cu126-aarch64-linux/metadata.json +1 -1
build/torch210-cxx11-cu128-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
build/torch210-cxx11-cu128-aarch64-linux/_ops.py +3 -3
build/torch210-cxx11-cu128-aarch64-linux/flash_attn_interface.py +8 -1
build/torch210-cxx11-cu128-aarch64-linux/metadata.json +1 -1
build/torch210-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
build/torch210-cxx11-cu130-aarch64-linux/_ops.py +3 -3
build/torch210-cxx11-cu130-aarch64-linux/flash_attn_interface.py +8 -1
build/torch210-cxx11-cu130-aarch64-linux/metadata.json +1 -1
build/torch211-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
build/torch211-cxx11-cu126-aarch64-linux/_ops.py +3 -3
build/torch211-cxx11-cu126-aarch64-linux/flash_attn_interface.py +8 -1
build/torch211-cxx11-cu126-aarch64-linux/metadata.json +1 -1
build/torch211-cxx11-cu128-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
build/torch211-cxx11-cu128-aarch64-linux/_ops.py +3 -3
build/torch211-cxx11-cu128-aarch64-linux/flash_attn_interface.py +8 -1
build/torch211-cxx11-cu128-aarch64-linux/metadata.json +1 -1
build/torch211-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
build/torch211-cxx11-cu130-aarch64-linux/_ops.py +3 -3
build/torch211-cxx11-cu130-aarch64-linux/flash_attn_interface.py +8 -1
build/torch211-cxx11-cu130-aarch64-linux/metadata.json +1 -1
build/torch212-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
build/torch212-cxx11-cu126-aarch64-linux/_ops.py +3 -3
build/torch212-cxx11-cu126-aarch64-linux/flash_attn_interface.py +8 -1
build/torch212-cxx11-cu126-aarch64-linux/metadata.json +1 -1
build/torch212-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
build/torch212-cxx11-cu130-aarch64-linux/_ops.py +3 -3
build/torch212-cxx11-cu130-aarch64-linux/flash_attn_interface.py +8 -1
build/torch212-cxx11-cu130-aarch64-linux/metadata.json +1 -1
build/torch212-cxx11-cu132-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
build/torch212-cxx11-cu132-aarch64-linux/_ops.py +3 -3
build/torch212-cxx11-cu132-aarch64-linux/flash_attn_interface.py +8 -1
build/torch212-cxx11-cu132-aarch64-linux/metadata.json +1 -1

build/torch210-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b5d9c0bbaf577980d0ec6fee304bb99b832bdab1761f89af0f045f806d9a0528
 size 448608936

 version https://git-lfs.github.com/spec/v1
+oid sha256:bb7a8f0c2e6f2c42e57c0cd9e1f9d9857247692b13ca2151eda741c13c803edb
 size 448608936

build/torch210-cxx11-cu126-aarch64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"

 import torch
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch210-cxx11-cu126-aarch64-linux/flash_attn_interface.py CHANGED Viewed

@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
     if device.type == "xpu":
-        return 64
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

     assert head_dim <= 256
     if device.type == "xpu":
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch210-cxx11-cu126-aarch64-linux/metadata.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "flash-attn2",
-  "id": "_flash_attn2_cuda_85c21a0",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

 {
   "name": "flash-attn2",
+  "id": "_flash_attn2_cuda_042c80b",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

build/torch210-cxx11-cu128-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:df49147950da73dbecb34b0fe2882471704c2668112af4d0cd4c783be1e01463
 size 1038067096

 version https://git-lfs.github.com/spec/v1
+oid sha256:ffb8774ec4639852aa2feaf1fc00670950d0bc394be16ed518d684aa76f752c4
 size 1038067096

build/torch210-cxx11-cu128-aarch64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"

 import torch
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch210-cxx11-cu128-aarch64-linux/flash_attn_interface.py CHANGED Viewed

@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
     if device.type == "xpu":
-        return 64
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

     assert head_dim <= 256
     if device.type == "xpu":
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch210-cxx11-cu128-aarch64-linux/metadata.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "flash-attn2",
-  "id": "_flash_attn2_cuda_85c21a0",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

 {
   "name": "flash-attn2",
+  "id": "_flash_attn2_cuda_042c80b",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

build/torch210-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:664859f216670f843a3e6483878a62586968be762becca2562c46ebc2a678151
 size 1008655376

 version https://git-lfs.github.com/spec/v1
+oid sha256:acfd5388e8adc2ed8b5fdfafb85350d9e99de45f43ee597d3e8fbbd95dc4be9a
 size 1008655376

build/torch210-cxx11-cu130-aarch64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"

 import torch
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch210-cxx11-cu130-aarch64-linux/flash_attn_interface.py CHANGED Viewed

@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
     if device.type == "xpu":
-        return 64
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

     assert head_dim <= 256
     if device.type == "xpu":
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch210-cxx11-cu130-aarch64-linux/metadata.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "flash-attn2",
-  "id": "_flash_attn2_cuda_85c21a0",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

 {
   "name": "flash-attn2",
+  "id": "_flash_attn2_cuda_042c80b",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

build/torch211-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d7dcc7af49adf3f3b5a4ad7d265e86be1dc5d2b33082f872166f1dba0103a836
 size 448605008

 version https://git-lfs.github.com/spec/v1
+oid sha256:0704852add679053a5a2f833eac909255535d77f3a98f50d4e1dbb14e65f65fe
 size 448605008

build/torch211-cxx11-cu126-aarch64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"

 import torch
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch211-cxx11-cu126-aarch64-linux/flash_attn_interface.py CHANGED Viewed

@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
     if device.type == "xpu":
-        return 64
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

     assert head_dim <= 256
     if device.type == "xpu":
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch211-cxx11-cu126-aarch64-linux/metadata.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "flash-attn2",
-  "id": "_flash_attn2_cuda_85c21a0",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

 {
   "name": "flash-attn2",
+  "id": "_flash_attn2_cuda_042c80b",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

build/torch211-cxx11-cu128-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:33ccbc5375ca903c01cd3b2720516be71dc12792fbf2f11397a1f7c1fa589038
 size 1037997832

 version https://git-lfs.github.com/spec/v1
+oid sha256:c4f80bd7ff4850910138841e07c879f89ee09eb09e2bdf2ccaa04aa099896153
 size 1037997832

build/torch211-cxx11-cu128-aarch64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"

 import torch
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch211-cxx11-cu128-aarch64-linux/flash_attn_interface.py CHANGED Viewed

@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
     if device.type == "xpu":
-        return 64
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

     assert head_dim <= 256
     if device.type == "xpu":
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch211-cxx11-cu128-aarch64-linux/metadata.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "flash-attn2",
-  "id": "_flash_attn2_cuda_85c21a0",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

 {
   "name": "flash-attn2",
+  "id": "_flash_attn2_cuda_042c80b",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

build/torch211-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5b7f23eaf5cc0d69a1bb4b021b1401b1da1f48f4d6302d07eb7632e9d0444ea9
 size 1008651464

 version https://git-lfs.github.com/spec/v1
+oid sha256:3e9d9856108fed89756cca5b71e00ae80b4d280a4a27a4cebffdfc059a242411
 size 1008651464

build/torch211-cxx11-cu130-aarch64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"

 import torch
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch211-cxx11-cu130-aarch64-linux/flash_attn_interface.py CHANGED Viewed

@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
     if device.type == "xpu":
-        return 64
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

     assert head_dim <= 256
     if device.type == "xpu":
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch211-cxx11-cu130-aarch64-linux/metadata.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "flash-attn2",
-  "id": "_flash_attn2_cuda_85c21a0",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

 {
   "name": "flash-attn2",
+  "id": "_flash_attn2_cuda_042c80b",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

build/torch212-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fc6154096b681b4a88167b3d0a3822b56460a047e87cd178c5b7160e33ad0929
 size 448533000

 version https://git-lfs.github.com/spec/v1
+oid sha256:5eed49fc0aee64c73429c5042b59bd8b98ce08ffcc0d3f005b93af74de38c642
 size 448533000

build/torch212-cxx11-cu126-aarch64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"

 import torch
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch212-cxx11-cu126-aarch64-linux/flash_attn_interface.py CHANGED Viewed

@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
     if device.type == "xpu":
-        return 64
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

     assert head_dim <= 256
     if device.type == "xpu":
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch212-cxx11-cu126-aarch64-linux/metadata.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "flash-attn2",
-  "id": "_flash_attn2_cuda_85c21a0",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

 {
   "name": "flash-attn2",
+  "id": "_flash_attn2_cuda_042c80b",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

build/torch212-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8f799b26743dfa81f06076108d96f800c5778f57d3561c261a5b940682129604
 size 1008645144

 version https://git-lfs.github.com/spec/v1
+oid sha256:8ea3ab6f8a78adbd4ec863a1b630cbff7fc51b70bdcf89f5a88274badc6c262c
 size 1008645144

build/torch212-cxx11-cu130-aarch64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"

 import torch
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch212-cxx11-cu130-aarch64-linux/flash_attn_interface.py CHANGED Viewed

@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
     if device.type == "xpu":
-        return 64
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

     assert head_dim <= 256
     if device.type == "xpu":
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch212-cxx11-cu130-aarch64-linux/metadata.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "flash-attn2",
-  "id": "_flash_attn2_cuda_85c21a0",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

 {
   "name": "flash-attn2",
+  "id": "_flash_attn2_cuda_042c80b",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

build/torch212-cxx11-cu132-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:783f89cd295676fd01ea025a5caf3ab14df2752cd42ace7b29fa9ae6cf4526af
 size 1026021240

 version https://git-lfs.github.com/spec/v1
+oid sha256:7d0adef32347b87be95b414f9fc888c90bf67859b0c87893143e540a96cd307f
 size 1026021240

build/torch212-cxx11-cu132-aarch64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"

 import torch
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch212-cxx11-cu132-aarch64-linux/flash_attn_interface.py CHANGED Viewed

@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
     if device.type == "xpu":
-        return 64
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

     assert head_dim <= 256
     if device.type == "xpu":
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch212-cxx11-cu132-aarch64-linux/metadata.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "flash-attn2",
-  "id": "_flash_attn2_cuda_85c21a0",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

 {
   "name": "flash-attn2",
+  "id": "_flash_attn2_cuda_042c80b",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],