Uploaded using `kernel-builder`.

Files changed (12) hide show

build/torch210-cxx11-cpu-x86_64-linux/{_flash_attn2_cpu_85c21a0.abi3.so → _flash_attn2_cpu_042c80b.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:adbb0be576fc0c6f55a884925959b5064e1c4d2b2969b71ea14b12d59f5ead5a
 size 1942240

 version https://git-lfs.github.com/spec/v1
+oid sha256:4c84eb7a0c9bdf71b230c454cc95bebdbb85f3ba6d5bcc6225bcae299acbfef5
 size 1942240

build/torch210-cxx11-cpu-x86_64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cpu_85c21a0
-ops = torch.ops._flash_attn2_cpu_85c21a0
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cpu_85c21a0::{op_name}"

 import torch
+from . import _flash_attn2_cpu_042c80b
+ops = torch.ops._flash_attn2_cpu_042c80b
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn2_cpu_042c80b::{op_name}"

build/torch210-cxx11-cpu-x86_64-linux/flash_attn_interface.py CHANGED Viewed

@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
     if device.type == "xpu":
-        return 64
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

     assert head_dim <= 256
     if device.type == "xpu":
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch210-cxx11-cpu-x86_64-linux/metadata.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "flash-attn2",
-  "id": "_flash_attn2_cpu_85c21a0",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

 {
   "name": "flash-attn2",
+  "id": "_flash_attn2_cpu_042c80b",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

build/torch211-cxx11-cpu-x86_64-linux/{_flash_attn2_cpu_85c21a0.abi3.so → _flash_attn2_cpu_042c80b.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bcd2e125a7a3a2deafeaa924e01c39305d5cd87707df7edfcd7e6229fe49c68d
 size 1942240

 version https://git-lfs.github.com/spec/v1
+oid sha256:149707762062fc9ad367d0395126db2bee01fdf8cc1b9f2677e147529dfc5734
 size 1942240

build/torch211-cxx11-cpu-x86_64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cpu_85c21a0
-ops = torch.ops._flash_attn2_cpu_85c21a0
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cpu_85c21a0::{op_name}"

 import torch
+from . import _flash_attn2_cpu_042c80b
+ops = torch.ops._flash_attn2_cpu_042c80b
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn2_cpu_042c80b::{op_name}"

build/torch211-cxx11-cpu-x86_64-linux/flash_attn_interface.py CHANGED Viewed

@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
     if device.type == "xpu":
-        return 64
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

     assert head_dim <= 256
     if device.type == "xpu":
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch211-cxx11-cpu-x86_64-linux/metadata.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "flash-attn2",
-  "id": "_flash_attn2_cpu_85c21a0",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

 {
   "name": "flash-attn2",
+  "id": "_flash_attn2_cpu_042c80b",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

build/torch212-cxx11-cpu-x86_64-linux/{_flash_attn2_cpu_85c21a0.abi3.so → _flash_attn2_cpu_042c80b.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d087d91e9fdb319aac66e3475d7c5456e380ab8acdd66fed2d5e41602994d5ea
 size 1942272

 version https://git-lfs.github.com/spec/v1
+oid sha256:5939950002f1e383de6d52c61b7b8c79e563b56398038c408a262160f9a36130
 size 1942272

build/torch212-cxx11-cpu-x86_64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cpu_85c21a0
-ops = torch.ops._flash_attn2_cpu_85c21a0
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cpu_85c21a0::{op_name}"

 import torch
+from . import _flash_attn2_cpu_042c80b
+ops = torch.ops._flash_attn2_cpu_042c80b
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn2_cpu_042c80b::{op_name}"

build/torch212-cxx11-cpu-x86_64-linux/flash_attn_interface.py CHANGED Viewed

@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
     if device.type == "xpu":
-        return 64
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

     assert head_dim <= 256
     if device.type == "xpu":
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch212-cxx11-cpu-x86_64-linux/metadata.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "flash-attn2",
-  "id": "_flash_attn2_cpu_85c21a0",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],

 {
   "name": "flash-attn2",
+  "id": "_flash_attn2_cpu_042c80b",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],