kernels-bot committed
Commit c7ac133 · verified · 1 Parent(s): 5dde0f3

Uploaded using `kernel-builder`.

Files changed (36):
  1. build/torch210-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  2. build/torch210-cxx11-cu126-x86_64-linux/_ops.py +3 -3
  3. build/torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py +8 -1
  4. build/torch210-cxx11-cu126-x86_64-linux/metadata.json +1 -1
  5. build/torch210-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  6. build/torch210-cxx11-cu128-x86_64-linux/_ops.py +3 -3
  7. build/torch210-cxx11-cu128-x86_64-linux/flash_attn_interface.py +8 -1
  8. build/torch210-cxx11-cu128-x86_64-linux/metadata.json +1 -1
  9. build/torch210-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  10. build/torch210-cxx11-cu130-x86_64-linux/_ops.py +3 -3
  11. build/torch210-cxx11-cu130-x86_64-linux/flash_attn_interface.py +8 -1
  12. build/torch210-cxx11-cu130-x86_64-linux/metadata.json +1 -1
  13. build/torch211-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  14. build/torch211-cxx11-cu126-x86_64-linux/_ops.py +3 -3
  15. build/torch211-cxx11-cu126-x86_64-linux/flash_attn_interface.py +8 -1
  16. build/torch211-cxx11-cu126-x86_64-linux/metadata.json +1 -1
  17. build/torch211-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  18. build/torch211-cxx11-cu128-x86_64-linux/_ops.py +3 -3
  19. build/torch211-cxx11-cu128-x86_64-linux/flash_attn_interface.py +8 -1
  20. build/torch211-cxx11-cu128-x86_64-linux/metadata.json +1 -1
  21. build/torch211-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  22. build/torch211-cxx11-cu130-x86_64-linux/_ops.py +3 -3
  23. build/torch211-cxx11-cu130-x86_64-linux/flash_attn_interface.py +8 -1
  24. build/torch211-cxx11-cu130-x86_64-linux/metadata.json +1 -1
  25. build/torch212-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  26. build/torch212-cxx11-cu126-x86_64-linux/_ops.py +3 -3
  27. build/torch212-cxx11-cu126-x86_64-linux/flash_attn_interface.py +8 -1
  28. build/torch212-cxx11-cu126-x86_64-linux/metadata.json +1 -1
  29. build/torch212-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  30. build/torch212-cxx11-cu130-x86_64-linux/_ops.py +3 -3
  31. build/torch212-cxx11-cu130-x86_64-linux/flash_attn_interface.py +8 -1
  32. build/torch212-cxx11-cu130-x86_64-linux/metadata.json +1 -1
  33. build/torch212-cxx11-cu132-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  34. build/torch212-cxx11-cu132-x86_64-linux/_ops.py +3 -3
  35. build/torch212-cxx11-cu132-x86_64-linux/flash_attn_interface.py +8 -1
  36. build/torch212-cxx11-cu132-x86_64-linux/metadata.json +1 -1
build/torch210-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b5c8f7ff6b43d2ae3a480f84055423f058f1b6777760dd9805bd90c2f760afc2
+oid sha256:dfe72b621f62710dcc54348e86bc32386e2d51efd3da741ba07bd16007add8a0
 size 448709080
build/torch210-cxx11-cu126-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"
build/torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)
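For illustration, a minimal standalone sketch of the new XPU branch: the head_dim-dependent block-size selection mirrors the lines added above, while the helper name _xpu_block_size_n is hypothetical. This same change is repeated in every build variant below:

# Minimal sketch: reproduces the block-size selection added for
# device.type == "xpu" above; the helper name is hypothetical.
def _xpu_block_size_n(head_dim: int) -> int:
    if head_dim <= 96:
        return 64
    elif head_dim <= 128:
        return 32
    elif head_dim <= 256:
        return 64
    else:
        # Unreachable through flash_attn_interface, which asserts
        # head_dim <= 256 before reaching this branch.
        return 32

assert _xpu_block_size_n(64) == 64
assert _xpu_block_size_n(128) == 32
assert _xpu_block_size_n(256) == 64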
build/torch210-cxx11-cu126-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
     "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],
build/torch210-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fe6f513da6f0e7288453be6b1fa03ce5f3e3573481de93e0fff5b19f8cb8ba38
+oid sha256:bc937bef8777e779386411a3154b639da575a5485001e19fc602bd9cfe5c109c
 size 1037795600

build/torch210-cxx11-cu128-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch210-cxx11-cu128-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch210-cxx11-cu128-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
     "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],
build/torch210-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b1c149903f3bb96376c552c9a5aca20edfc8090db2b92b722a95e10b15aef4bb
+oid sha256:8d852266c6061ece6f6bb7a29f7107cf838636bc82432fef791dac810642ffeb
 size 1008994200

build/torch210-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch210-cxx11-cu130-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch210-cxx11-cu130-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
     "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],
build/torch211-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c6e9e6150d0e7084ff6c7813a3bae828f60a1bda15d9333107a9f089c30b9ab6
+oid sha256:3f62c82748346531b0e88011c252b2688bff4add42993ac6ac28b891a35f565c
 size 448697832

build/torch211-cxx11-cu126-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch211-cxx11-cu126-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch211-cxx11-cu126-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
     "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],
build/torch211-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ce1c4a6ee759f339ef53b41086bee97f5d26701be9f28e63f42b5dd443f228b2
+oid sha256:7006b25ac0608ec8b1da90962070bd77efc163a221bc056172ff2f2f0b6e2c81
 size 1037788592

build/torch211-cxx11-cu128-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch211-cxx11-cu128-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch211-cxx11-cu128-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
     "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],
build/torch211-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:20663e8a09afea638c02f61f6f75325c3916b755073020dfa0724b5937910430
+oid sha256:cf74261ec520a75d22aa355e834ca33ded833ae53f859e90f882672f988eff49
 size 1008987104

build/torch211-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch211-cxx11-cu130-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch211-cxx11-cu130-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
     "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],
build/torch212-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:68669e1967cc7c453b79172e86f70164764e08e2eefc7473e944f86a2c8c63fe
+oid sha256:55434e602df428ddc3780611d5358c7ff009159c3b66a065f7941446f51ea825
 size 448703856

build/torch212-cxx11-cu126-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch212-cxx11-cu126-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch212-cxx11-cu126-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
     "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],
build/torch212-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f9e84c85b64d504659a615177c9ee58e77b16ef2f3ed93879007a84ede1370c1
+oid sha256:59aea46816c69a682c3f1796df10140ae3ac7df1e8b3a742d4bf716a18c955f7
 size 1008988960

build/torch212-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch212-cxx11-cu130-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch212-cxx11-cu130-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
    "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],
build/torch212-cxx11-cu132-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:abb8f6ea71a538d69b0b2ad7e947a75d8647a14b7c572c7738097cb9b04aa53f
+oid sha256:66999c12ec5a807e9160409ac09ad056f01ee2b60217df2088b5dc13bcb4cbf4
 size 1026378680

build/torch212-cxx11-cu132-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch212-cxx11-cu132-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch212-cxx11-cu132-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
     "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],