kernels-bot committed on
Commit
5dde0f3
·
verified ·
1 Parent(s): 19b59e6

Uploaded using `kernel-builder`.

Browse files
Files changed (36) hide show
  1. build/torch210-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  2. build/torch210-cxx11-cu126-aarch64-linux/_ops.py +3 -3
  3. build/torch210-cxx11-cu126-aarch64-linux/flash_attn_interface.py +8 -1
  4. build/torch210-cxx11-cu126-aarch64-linux/metadata.json +1 -1
  5. build/torch210-cxx11-cu128-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  6. build/torch210-cxx11-cu128-aarch64-linux/_ops.py +3 -3
  7. build/torch210-cxx11-cu128-aarch64-linux/flash_attn_interface.py +8 -1
  8. build/torch210-cxx11-cu128-aarch64-linux/metadata.json +1 -1
  9. build/torch210-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  10. build/torch210-cxx11-cu130-aarch64-linux/_ops.py +3 -3
  11. build/torch210-cxx11-cu130-aarch64-linux/flash_attn_interface.py +8 -1
  12. build/torch210-cxx11-cu130-aarch64-linux/metadata.json +1 -1
  13. build/torch211-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  14. build/torch211-cxx11-cu126-aarch64-linux/_ops.py +3 -3
  15. build/torch211-cxx11-cu126-aarch64-linux/flash_attn_interface.py +8 -1
  16. build/torch211-cxx11-cu126-aarch64-linux/metadata.json +1 -1
  17. build/torch211-cxx11-cu128-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  18. build/torch211-cxx11-cu128-aarch64-linux/_ops.py +3 -3
  19. build/torch211-cxx11-cu128-aarch64-linux/flash_attn_interface.py +8 -1
  20. build/torch211-cxx11-cu128-aarch64-linux/metadata.json +1 -1
  21. build/torch211-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  22. build/torch211-cxx11-cu130-aarch64-linux/_ops.py +3 -3
  23. build/torch211-cxx11-cu130-aarch64-linux/flash_attn_interface.py +8 -1
  24. build/torch211-cxx11-cu130-aarch64-linux/metadata.json +1 -1
  25. build/torch212-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  26. build/torch212-cxx11-cu126-aarch64-linux/_ops.py +3 -3
  27. build/torch212-cxx11-cu126-aarch64-linux/flash_attn_interface.py +8 -1
  28. build/torch212-cxx11-cu126-aarch64-linux/metadata.json +1 -1
  29. build/torch212-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  30. build/torch212-cxx11-cu130-aarch64-linux/_ops.py +3 -3
  31. build/torch212-cxx11-cu130-aarch64-linux/flash_attn_interface.py +8 -1
  32. build/torch212-cxx11-cu130-aarch64-linux/metadata.json +1 -1
  33. build/torch212-cxx11-cu132-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  34. build/torch212-cxx11-cu132-aarch64-linux/_ops.py +3 -3
  35. build/torch212-cxx11-cu132-aarch64-linux/flash_attn_interface.py +8 -1
  36. build/torch212-cxx11-cu132-aarch64-linux/metadata.json +1 -1
build/torch210-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5d9c0bbaf577980d0ec6fee304bb99b832bdab1761f89af0f045f806d9a0528
3
  size 448608936
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb7a8f0c2e6f2c42e57c0cd9e1f9d9857247692b13ca2151eda741c13c803edb
3
  size 448608936
build/torch210-cxx11-cu126-aarch64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_85c21a0
3
- ops = torch.ops._flash_attn2_cuda_85c21a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_85c21a0::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_042c80b
3
+ ops = torch.ops._flash_attn2_cuda_042c80b
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_042c80b::{op_name}"
build/torch210-cxx11-cu126-aarch64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
- return 64
 
 
 
 
 
 
 
40
 
41
  # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
 
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
+ if head_dim <= 96:
40
+ return 64
41
+ elif head_dim <= 128:
42
+ return 32
43
+ elif head_dim <= 256:
44
+ return 64
45
+ else:
46
+ return 32
47
 
48
  # This should match the block sizes in the CUDA kernel
49
  major, minor = torch.cuda.get_device_capability(device)
build/torch210-cxx11-cu126-aarch64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "name": "flash-attn2",
3
- "id": "_flash_attn2_cuda_85c21a0",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
 
1
  {
2
  "name": "flash-attn2",
3
+ "id": "_flash_attn2_cuda_042c80b",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
build/torch210-cxx11-cu128-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df49147950da73dbecb34b0fe2882471704c2668112af4d0cd4c783be1e01463
3
  size 1038067096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffb8774ec4639852aa2feaf1fc00670950d0bc394be16ed518d684aa76f752c4
3
  size 1038067096
build/torch210-cxx11-cu128-aarch64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_85c21a0
3
- ops = torch.ops._flash_attn2_cuda_85c21a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_85c21a0::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_042c80b
3
+ ops = torch.ops._flash_attn2_cuda_042c80b
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_042c80b::{op_name}"
build/torch210-cxx11-cu128-aarch64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
- return 64
 
 
 
 
 
 
 
40
 
41
  # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
 
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
+ if head_dim <= 96:
40
+ return 64
41
+ elif head_dim <= 128:
42
+ return 32
43
+ elif head_dim <= 256:
44
+ return 64
45
+ else:
46
+ return 32
47
 
48
  # This should match the block sizes in the CUDA kernel
49
  major, minor = torch.cuda.get_device_capability(device)
build/torch210-cxx11-cu128-aarch64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "name": "flash-attn2",
3
- "id": "_flash_attn2_cuda_85c21a0",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
 
1
  {
2
  "name": "flash-attn2",
3
+ "id": "_flash_attn2_cuda_042c80b",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
build/torch210-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:664859f216670f843a3e6483878a62586968be762becca2562c46ebc2a678151
3
  size 1008655376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acfd5388e8adc2ed8b5fdfafb85350d9e99de45f43ee597d3e8fbbd95dc4be9a
3
  size 1008655376
build/torch210-cxx11-cu130-aarch64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_85c21a0
3
- ops = torch.ops._flash_attn2_cuda_85c21a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_85c21a0::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_042c80b
3
+ ops = torch.ops._flash_attn2_cuda_042c80b
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_042c80b::{op_name}"
build/torch210-cxx11-cu130-aarch64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
- return 64
 
 
 
 
 
 
 
40
 
41
  # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
 
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
+ if head_dim <= 96:
40
+ return 64
41
+ elif head_dim <= 128:
42
+ return 32
43
+ elif head_dim <= 256:
44
+ return 64
45
+ else:
46
+ return 32
47
 
48
  # This should match the block sizes in the CUDA kernel
49
  major, minor = torch.cuda.get_device_capability(device)
build/torch210-cxx11-cu130-aarch64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "name": "flash-attn2",
3
- "id": "_flash_attn2_cuda_85c21a0",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
 
1
  {
2
  "name": "flash-attn2",
3
+ "id": "_flash_attn2_cuda_042c80b",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
build/torch211-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7dcc7af49adf3f3b5a4ad7d265e86be1dc5d2b33082f872166f1dba0103a836
3
  size 448605008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0704852add679053a5a2f833eac909255535d77f3a98f50d4e1dbb14e65f65fe
3
  size 448605008
build/torch211-cxx11-cu126-aarch64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_85c21a0
3
- ops = torch.ops._flash_attn2_cuda_85c21a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_85c21a0::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_042c80b
3
+ ops = torch.ops._flash_attn2_cuda_042c80b
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_042c80b::{op_name}"
build/torch211-cxx11-cu126-aarch64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
- return 64
 
 
 
 
 
 
 
40
 
41
  # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
 
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
+ if head_dim <= 96:
40
+ return 64
41
+ elif head_dim <= 128:
42
+ return 32
43
+ elif head_dim <= 256:
44
+ return 64
45
+ else:
46
+ return 32
47
 
48
  # This should match the block sizes in the CUDA kernel
49
  major, minor = torch.cuda.get_device_capability(device)
build/torch211-cxx11-cu126-aarch64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "name": "flash-attn2",
3
- "id": "_flash_attn2_cuda_85c21a0",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
 
1
  {
2
  "name": "flash-attn2",
3
+ "id": "_flash_attn2_cuda_042c80b",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
build/torch211-cxx11-cu128-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33ccbc5375ca903c01cd3b2720516be71dc12792fbf2f11397a1f7c1fa589038
3
  size 1037997832
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4f80bd7ff4850910138841e07c879f89ee09eb09e2bdf2ccaa04aa099896153
3
  size 1037997832
build/torch211-cxx11-cu128-aarch64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_85c21a0
3
- ops = torch.ops._flash_attn2_cuda_85c21a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_85c21a0::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_042c80b
3
+ ops = torch.ops._flash_attn2_cuda_042c80b
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_042c80b::{op_name}"
build/torch211-cxx11-cu128-aarch64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
- return 64
 
 
 
 
 
 
 
40
 
41
  # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
 
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
+ if head_dim <= 96:
40
+ return 64
41
+ elif head_dim <= 128:
42
+ return 32
43
+ elif head_dim <= 256:
44
+ return 64
45
+ else:
46
+ return 32
47
 
48
  # This should match the block sizes in the CUDA kernel
49
  major, minor = torch.cuda.get_device_capability(device)
build/torch211-cxx11-cu128-aarch64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "name": "flash-attn2",
3
- "id": "_flash_attn2_cuda_85c21a0",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
 
1
  {
2
  "name": "flash-attn2",
3
+ "id": "_flash_attn2_cuda_042c80b",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
build/torch211-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b7f23eaf5cc0d69a1bb4b021b1401b1da1f48f4d6302d07eb7632e9d0444ea9
3
  size 1008651464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e9d9856108fed89756cca5b71e00ae80b4d280a4a27a4cebffdfc059a242411
3
  size 1008651464
build/torch211-cxx11-cu130-aarch64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_85c21a0
3
- ops = torch.ops._flash_attn2_cuda_85c21a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_85c21a0::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_042c80b
3
+ ops = torch.ops._flash_attn2_cuda_042c80b
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_042c80b::{op_name}"
build/torch211-cxx11-cu130-aarch64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
- return 64
 
 
 
 
 
 
 
40
 
41
  # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
 
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
+ if head_dim <= 96:
40
+ return 64
41
+ elif head_dim <= 128:
42
+ return 32
43
+ elif head_dim <= 256:
44
+ return 64
45
+ else:
46
+ return 32
47
 
48
  # This should match the block sizes in the CUDA kernel
49
  major, minor = torch.cuda.get_device_capability(device)
build/torch211-cxx11-cu130-aarch64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "name": "flash-attn2",
3
- "id": "_flash_attn2_cuda_85c21a0",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
 
1
  {
2
  "name": "flash-attn2",
3
+ "id": "_flash_attn2_cuda_042c80b",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
build/torch212-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc6154096b681b4a88167b3d0a3822b56460a047e87cd178c5b7160e33ad0929
3
  size 448533000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eed49fc0aee64c73429c5042b59bd8b98ce08ffcc0d3f005b93af74de38c642
3
  size 448533000
build/torch212-cxx11-cu126-aarch64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_85c21a0
3
- ops = torch.ops._flash_attn2_cuda_85c21a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_85c21a0::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_042c80b
3
+ ops = torch.ops._flash_attn2_cuda_042c80b
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_042c80b::{op_name}"
build/torch212-cxx11-cu126-aarch64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
- return 64
 
 
 
 
 
 
 
40
 
41
  # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
 
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
+ if head_dim <= 96:
40
+ return 64
41
+ elif head_dim <= 128:
42
+ return 32
43
+ elif head_dim <= 256:
44
+ return 64
45
+ else:
46
+ return 32
47
 
48
  # This should match the block sizes in the CUDA kernel
49
  major, minor = torch.cuda.get_device_capability(device)
build/torch212-cxx11-cu126-aarch64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "name": "flash-attn2",
3
- "id": "_flash_attn2_cuda_85c21a0",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
 
1
  {
2
  "name": "flash-attn2",
3
+ "id": "_flash_attn2_cuda_042c80b",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
build/torch212-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f799b26743dfa81f06076108d96f800c5778f57d3561c261a5b940682129604
3
  size 1008645144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ea3ab6f8a78adbd4ec863a1b630cbff7fc51b70bdcf89f5a88274badc6c262c
3
  size 1008645144
build/torch212-cxx11-cu130-aarch64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_85c21a0
3
- ops = torch.ops._flash_attn2_cuda_85c21a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_85c21a0::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_042c80b
3
+ ops = torch.ops._flash_attn2_cuda_042c80b
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_042c80b::{op_name}"
build/torch212-cxx11-cu130-aarch64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
- return 64
 
 
 
 
 
 
 
40
 
41
  # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
 
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
+ if head_dim <= 96:
40
+ return 64
41
+ elif head_dim <= 128:
42
+ return 32
43
+ elif head_dim <= 256:
44
+ return 64
45
+ else:
46
+ return 32
47
 
48
  # This should match the block sizes in the CUDA kernel
49
  major, minor = torch.cuda.get_device_capability(device)
build/torch212-cxx11-cu130-aarch64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "name": "flash-attn2",
3
- "id": "_flash_attn2_cuda_85c21a0",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
 
1
  {
2
  "name": "flash-attn2",
3
+ "id": "_flash_attn2_cuda_042c80b",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
build/torch212-cxx11-cu132-aarch64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:783f89cd295676fd01ea025a5caf3ab14df2752cd42ace7b29fa9ae6cf4526af
3
  size 1026021240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d0adef32347b87be95b414f9fc888c90bf67859b0c87893143e540a96cd307f
3
  size 1026021240
build/torch212-cxx11-cu132-aarch64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_85c21a0
3
- ops = torch.ops._flash_attn2_cuda_85c21a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_85c21a0::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_042c80b
3
+ ops = torch.ops._flash_attn2_cuda_042c80b
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_042c80b::{op_name}"
build/torch212-cxx11-cu132-aarch64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
- return 64
 
 
 
 
 
 
 
40
 
41
  # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
 
36
  assert head_dim <= 256
37
 
38
  if device.type == "xpu":
39
+ if head_dim <= 96:
40
+ return 64
41
+ elif head_dim <= 128:
42
+ return 32
43
+ elif head_dim <= 256:
44
+ return 64
45
+ else:
46
+ return 32
47
 
48
  # This should match the block sizes in the CUDA kernel
49
  major, minor = torch.cuda.get_device_capability(device)
build/torch212-cxx11-cu132-aarch64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "name": "flash-attn2",
3
- "id": "_flash_attn2_cuda_85c21a0",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],
 
1
  {
2
  "name": "flash-attn2",
3
+ "id": "_flash_attn2_cuda_042c80b",
4
  "version": 1,
5
  "license": "BSD-3-Clause",
6
  "python-depends": [],