kernels-bot committed
Commit c7ac133 · verified · 1 Parent(s): 5dde0f3

Uploaded using `kernel-builder`.

Files changed (36):
  1. build/torch210-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  2. build/torch210-cxx11-cu126-x86_64-linux/_ops.py +3 -3
  3. build/torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py +8 -1
  4. build/torch210-cxx11-cu126-x86_64-linux/metadata.json +1 -1
  5. build/torch210-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  6. build/torch210-cxx11-cu128-x86_64-linux/_ops.py +3 -3
  7. build/torch210-cxx11-cu128-x86_64-linux/flash_attn_interface.py +8 -1
  8. build/torch210-cxx11-cu128-x86_64-linux/metadata.json +1 -1
  9. build/torch210-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  10. build/torch210-cxx11-cu130-x86_64-linux/_ops.py +3 -3
  11. build/torch210-cxx11-cu130-x86_64-linux/flash_attn_interface.py +8 -1
  12. build/torch210-cxx11-cu130-x86_64-linux/metadata.json +1 -1
  13. build/torch211-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  14. build/torch211-cxx11-cu126-x86_64-linux/_ops.py +3 -3
  15. build/torch211-cxx11-cu126-x86_64-linux/flash_attn_interface.py +8 -1
  16. build/torch211-cxx11-cu126-x86_64-linux/metadata.json +1 -1
  17. build/torch211-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  18. build/torch211-cxx11-cu128-x86_64-linux/_ops.py +3 -3
  19. build/torch211-cxx11-cu128-x86_64-linux/flash_attn_interface.py +8 -1
  20. build/torch211-cxx11-cu128-x86_64-linux/metadata.json +1 -1
  21. build/torch211-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  22. build/torch211-cxx11-cu130-x86_64-linux/_ops.py +3 -3
  23. build/torch211-cxx11-cu130-x86_64-linux/flash_attn_interface.py +8 -1
  24. build/torch211-cxx11-cu130-x86_64-linux/metadata.json +1 -1
  25. build/torch212-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  26. build/torch212-cxx11-cu126-x86_64-linux/_ops.py +3 -3
  27. build/torch212-cxx11-cu126-x86_64-linux/flash_attn_interface.py +8 -1
  28. build/torch212-cxx11-cu126-x86_64-linux/metadata.json +1 -1
  29. build/torch212-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  30. build/torch212-cxx11-cu130-x86_64-linux/_ops.py +3 -3
  31. build/torch212-cxx11-cu130-x86_64-linux/flash_attn_interface.py +8 -1
  32. build/torch212-cxx11-cu130-x86_64-linux/metadata.json +1 -1
  33. build/torch212-cxx11-cu132-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
  34. build/torch212-cxx11-cu132-x86_64-linux/_ops.py +3 -3
  35. build/torch212-cxx11-cu132-x86_64-linux/flash_attn_interface.py +8 -1
  36. build/torch212-cxx11-cu132-x86_64-linux/metadata.json +1 -1
build/torch210-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b5c8f7ff6b43d2ae3a480f84055423f058f1b6777760dd9805bd90c2f760afc2
+oid sha256:dfe72b621f62710dcc54348e86bc32386e2d51efd3da741ba07bd16007add8a0
 size 448709080
build/torch210-cxx11-cu126-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"
build/torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)
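For illustration, a minimal standalone sketch of the new XPU branch: the head_dim-dependent block-size selection mirrors the lines added above, while the helper name _xpu_block_size_n is hypothetical. This same change is repeated in every build variant below:

# Minimal sketch: reproduces the block-size selection added for
# device.type == "xpu" above; the helper name is hypothetical.
def _xpu_block_size_n(head_dim: int) -> int:
    if head_dim <= 96:
        return 64
    elif head_dim <= 128:
        return 32
    elif head_dim <= 256:
        return 64
    else:
        # Unreachable through flash_attn_interface, which asserts
        # head_dim <= 256 before reaching this branch.
        return 32

assert _xpu_block_size_n(64) == 64
assert _xpu_block_size_n(128) == 32
assert _xpu_block_size_n(256) == 64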
build/torch210-cxx11-cu126-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
     "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],
build/torch210-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fe6f513da6f0e7288453be6b1fa03ce5f3e3573481de93e0fff5b19f8cb8ba38
+oid sha256:bc937bef8777e779386411a3154b639da575a5485001e19fc602bd9cfe5c109c
 size 1037795600

build/torch210-cxx11-cu128-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch210-cxx11-cu128-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch210-cxx11-cu128-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
     "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],
build/torch210-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b1c149903f3bb96376c552c9a5aca20edfc8090db2b92b722a95e10b15aef4bb
+oid sha256:8d852266c6061ece6f6bb7a29f7107cf838636bc82432fef791dac810642ffeb
 size 1008994200

build/torch210-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch210-cxx11-cu130-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch210-cxx11-cu130-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
     "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],
build/torch211-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c6e9e6150d0e7084ff6c7813a3bae828f60a1bda15d9333107a9f089c30b9ab6
+oid sha256:3f62c82748346531b0e88011c252b2688bff4add42993ac6ac28b891a35f565c
 size 448697832

build/torch211-cxx11-cu126-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch211-cxx11-cu126-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch211-cxx11-cu126-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
     "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],
build/torch211-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ce1c4a6ee759f339ef53b41086bee97f5d26701be9f28e63f42b5dd443f228b2
+oid sha256:7006b25ac0608ec8b1da90962070bd77efc163a221bc056172ff2f2f0b6e2c81
 size 1037788592

build/torch211-cxx11-cu128-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch211-cxx11-cu128-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch211-cxx11-cu128-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
     "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],
build/torch211-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:20663e8a09afea638c02f61f6f75325c3916b755073020dfa0724b5937910430
+oid sha256:cf74261ec520a75d22aa355e834ca33ded833ae53f859e90f882672f988eff49
 size 1008987104

build/torch211-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch211-cxx11-cu130-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch211-cxx11-cu130-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
     "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],
build/torch212-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:68669e1967cc7c453b79172e86f70164764e08e2eefc7473e944f86a2c8c63fe
+oid sha256:55434e602df428ddc3780611d5358c7ff009159c3b66a065f7941446f51ea825
 size 448703856

build/torch212-cxx11-cu126-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch212-cxx11-cu126-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch212-cxx11-cu126-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
     "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],
build/torch212-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f9e84c85b64d504659a615177c9ee58e77b16ef2f3ed93879007a84ede1370c1
+oid sha256:59aea46816c69a682c3f1796df10140ae3ac7df1e8b3a742d4bf716a18c955f7
 size 1008988960

build/torch212-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch212-cxx11-cu130-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch212-cxx11-cu130-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
    "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],
build/torch212-cxx11-cu132-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:abb8f6ea71a538d69b0b2ad7e947a75d8647a14b7c572c7738097cb9b04aa53f
+oid sha256:66999c12ec5a807e9160409ac09ad056f01ee2b60217df2088b5dc13bcb4cbf4
 size 1026378680

build/torch212-cxx11-cu132-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"

build/torch212-cxx11-cu132-x86_64-linux/flash_attn_interface.py CHANGED
@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-        return 64
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)

build/torch212-cxx11-cu132-x86_64-linux/metadata.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "flash-attn2",
-    "id": "_flash_attn2_cuda_85c21a0",
+    "id": "_flash_attn2_cuda_042c80b",
     "version": 1,
     "license": "BSD-3-Clause",
     "python-depends": [],