Kernels: trusted publisher. Uploaded using `kernel-builder`.
Files changed:

- build/torch210-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
- build/torch210-cxx11-cu126-x86_64-linux/_ops.py +3 -3
- build/torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py +8 -1
- build/torch210-cxx11-cu126-x86_64-linux/metadata.json +1 -1
- build/torch210-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
- build/torch210-cxx11-cu128-x86_64-linux/_ops.py +3 -3
- build/torch210-cxx11-cu128-x86_64-linux/flash_attn_interface.py +8 -1
- build/torch210-cxx11-cu128-x86_64-linux/metadata.json +1 -1
- build/torch210-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
- build/torch210-cxx11-cu130-x86_64-linux/_ops.py +3 -3
- build/torch210-cxx11-cu130-x86_64-linux/flash_attn_interface.py +8 -1
- build/torch210-cxx11-cu130-x86_64-linux/metadata.json +1 -1
- build/torch211-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
- build/torch211-cxx11-cu126-x86_64-linux/_ops.py +3 -3
- build/torch211-cxx11-cu126-x86_64-linux/flash_attn_interface.py +8 -1
- build/torch211-cxx11-cu126-x86_64-linux/metadata.json +1 -1
- build/torch211-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
- build/torch211-cxx11-cu128-x86_64-linux/_ops.py +3 -3
- build/torch211-cxx11-cu128-x86_64-linux/flash_attn_interface.py +8 -1
- build/torch211-cxx11-cu128-x86_64-linux/metadata.json +1 -1
- build/torch211-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
- build/torch211-cxx11-cu130-x86_64-linux/_ops.py +3 -3
- build/torch211-cxx11-cu130-x86_64-linux/flash_attn_interface.py +8 -1
- build/torch211-cxx11-cu130-x86_64-linux/metadata.json +1 -1
- build/torch212-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
- build/torch212-cxx11-cu126-x86_64-linux/_ops.py +3 -3
- build/torch212-cxx11-cu126-x86_64-linux/flash_attn_interface.py +8 -1
- build/torch212-cxx11-cu126-x86_64-linux/metadata.json +1 -1
- build/torch212-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
- build/torch212-cxx11-cu130-x86_64-linux/_ops.py +3 -3
- build/torch212-cxx11-cu130-x86_64-linux/flash_attn_interface.py +8 -1
- build/torch212-cxx11-cu130-x86_64-linux/metadata.json +1 -1
- build/torch212-cxx11-cu132-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so} +1 -1
- build/torch212-cxx11-cu132-x86_64-linux/_ops.py +3 -3
- build/torch212-cxx11-cu132-x86_64-linux/flash_attn_interface.py +8 -1
- build/torch212-cxx11-cu132-x86_64-linux/metadata.json +1 -1
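The same three-part change lands in every build variant: the compiled extension is renamed from `_flash_attn2_cuda_85c21a0.abi3.so` to `_flash_attn2_cuda_042c80b.abi3.so`, `_ops.py` and `metadata.json` are regenerated to reference the new build id, and `flash_attn_interface.py` gains an XPU block-size branch. Because the per-file diffs are identical across variants, they are shown in full once below (for torch210-cxx11-cu126) and summarized for the rest. As a minimal sketch of how a build tree like this is consumed downstream (the repository id and the `flash_attn_func` attribute are assumptions for illustration, not taken from this commit):

```python
# Sketch: loading a pre-built kernel from the Hub with the `kernels` library,
# which selects the build variant matching the local torch/CUDA toolchain.
# The repo id "kernels-community/flash-attn2" is assumed for illustration.
import torch
from kernels import get_kernel

flash_attn2 = get_kernel("kernels-community/flash-attn2")

q = torch.randn(2, 1024, 8, 64, dtype=torch.float16, device="cuda")
k = torch.randn(2, 1024, 8, 64, dtype=torch.float16, device="cuda")
v = torch.randn(2, 1024, 8, 64, dtype=torch.float16, device="cuda")

# flash_attn_func is defined in the flash_attn_interface.py shipped with
# each variant (assumed to be re-exported by the kernel package).
out = flash_attn2.flash_attn_func(q, k, v, causal=True)
print(out.shape)  # torch.Size([2, 1024, 8, 64])
```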
build/torch210-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so}
RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:dfe72b621f62710dcc54348e86bc32386e2d51efd3da741ba07bd16007add8a0
 size 448709080
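The `.abi3.so` binaries are tracked with Git LFS, so the repository stores three-line pointer files and the rename surfaces as a pointer diff: only the `oid sha256:` line changes (the previous hash is not shown in this view) while `size` happens to be unchanged. A minimal sketch of how such a pointer is derived from the binary (the path is illustrative):

```python
# Sketch: building the Git LFS pointer for a tracked file. The oid is just
# the SHA-256 of the file contents, so a rebuilt .so changes the oid while
# the pointer layout stays the same as in the diffs above.
import hashlib
import os

def lfs_pointer(path: str) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return (
        "version https://git-lfs.github.com/spec/v1\n"
        f"oid sha256:{h.hexdigest()}\n"
        f"size {os.path.getsize(path)}\n"
    )

print(lfs_pointer("build/torch210-cxx11-cu126-x86_64-linux/_flash_attn2_cuda_042c80b.abi3.so"))
```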
build/torch210-cxx11-cu126-x86_64-linux/_ops.py
CHANGED

@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_cuda_85c21a0
-ops = torch.ops._flash_attn2_cuda_85c21a0
+from . import _flash_attn2_cuda_042c80b
+ops = torch.ops._flash_attn2_cuda_042c80b
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_cuda_85c21a0::{op_name}"
+    return f"_flash_attn2_cuda_042c80b::{op_name}"
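`_ops.py` is generated glue: it imports the hashed extension module and exposes its `torch.ops` namespace, and the commit simply re-points it at the new build id. The hashed name keeps each build's custom ops in a unique namespace so that two builds can coexist in one process. A small self-contained sketch of the pattern (the `fwd` op name is illustrative):

```python
# Sketch of the namespacing pattern in the regenerated _ops.py.
BUILD_ID = "_flash_attn2_cuda_042c80b"

def add_op_namespace_prefix(op_name: str) -> str:
    """Qualify an op name with the build-specific namespace."""
    return f"{BUILD_ID}::{op_name}"

assert add_op_namespace_prefix("fwd") == "_flash_attn2_cuda_042c80b::fwd"
# In the package itself, `from . import _flash_attn2_cuda_042c80b` loads the
# extension, after which its registered ops are reachable through
# torch.ops._flash_attn2_cuda_042c80b.<op>.
```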
build/torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py
CHANGED

@@ -36,7 +36,14 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     assert head_dim <= 256
 
     if device.type == "xpu":
-
+        if head_dim <= 96:
+            return 64
+        elif head_dim <= 128:
+            return 32
+        elif head_dim <= 256:
+            return 64
+        else:
+            return 32
 
     # This should match the block sizes in the CUDA kernel
     major, minor = torch.cuda.get_device_capability(device)
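This is the one functional change in the commit: on XPU devices, `_get_block_size_n` now picks the kernel's N-dimension block size from the head dimension alone, before the CUDA compute-capability logic runs. A self-contained restatement with example values (the standalone function name is an illustration; in the package this logic sits inside `_get_block_size_n`):

```python
# Restatement of the XPU branch added above, runnable on its own.
def xpu_block_size_n(head_dim: int) -> int:
    if head_dim <= 96:
        return 64
    elif head_dim <= 128:
        return 32
    elif head_dim <= 256:
        return 64
    else:
        # Unreachable in practice: _get_block_size_n asserts
        # head_dim <= 256 before this branch (see the context lines).
        return 32

assert xpu_block_size_n(64) == 64
assert xpu_block_size_n(128) == 32
assert xpu_block_size_n(256) == 64
```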
build/torch210-cxx11-cu126-x86_64-linux/metadata.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "flash-attn2",
-  "id": "_flash_attn2_cuda_85c21a0",
+  "id": "_flash_attn2_cuda_042c80b",
   "version": 1,
   "license": "BSD-3-Clause",
   "python-depends": [],
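`metadata.json` carries the build id that tooling can use to map a variant directory to its extension module. A minimal sketch of reading it (the path is illustrative):

```python
# Sketch: resolving the extension module name from a variant's metadata.
import json
from pathlib import Path

meta = json.loads(
    Path("build/torch210-cxx11-cu126-x86_64-linux/metadata.json").read_text()
)
module_name = meta["id"]  # "_flash_attn2_cuda_042c80b"
print(meta["name"], module_name)
```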
build/torch210-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so}
RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:bc937bef8777e779386411a3154b639da575a5485001e19fc602bd9cfe5c109c
 size 1037795600
build/torch210-cxx11-cu128-x86_64-linux/_ops.py
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/_ops.py diff above)

build/torch210-cxx11-cu128-x86_64-linux/flash_attn_interface.py
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py diff above)

build/torch210-cxx11-cu128-x86_64-linux/metadata.json
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/metadata.json diff above)
build/torch210-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so}
RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:8d852266c6061ece6f6bb7a29f7107cf838636bc82432fef791dac810642ffeb
 size 1008994200
build/torch210-cxx11-cu130-x86_64-linux/_ops.py
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/_ops.py diff above)

build/torch210-cxx11-cu130-x86_64-linux/flash_attn_interface.py
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py diff above)

build/torch210-cxx11-cu130-x86_64-linux/metadata.json
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/metadata.json diff above)
build/torch211-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so}
RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:3f62c82748346531b0e88011c252b2688bff4add42993ac6ac28b891a35f565c
 size 448697832
build/torch211-cxx11-cu126-x86_64-linux/_ops.py
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/_ops.py diff above)

build/torch211-cxx11-cu126-x86_64-linux/flash_attn_interface.py
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py diff above)

build/torch211-cxx11-cu126-x86_64-linux/metadata.json
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/metadata.json diff above)
build/torch211-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so}
RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:7006b25ac0608ec8b1da90962070bd77efc163a221bc056172ff2f2f0b6e2c81
 size 1037788592
build/torch211-cxx11-cu128-x86_64-linux/_ops.py
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/_ops.py diff above)

build/torch211-cxx11-cu128-x86_64-linux/flash_attn_interface.py
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py diff above)

build/torch211-cxx11-cu128-x86_64-linux/metadata.json
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/metadata.json diff above)
build/torch211-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so}
RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:cf74261ec520a75d22aa355e834ca33ded833ae53f859e90f882672f988eff49
 size 1008987104
build/torch211-cxx11-cu130-x86_64-linux/_ops.py
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/_ops.py diff above)

build/torch211-cxx11-cu130-x86_64-linux/flash_attn_interface.py
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py diff above)

build/torch211-cxx11-cu130-x86_64-linux/metadata.json
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/metadata.json diff above)
build/torch212-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so}
RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:55434e602df428ddc3780611d5358c7ff009159c3b66a065f7941446f51ea825
 size 448703856
build/torch212-cxx11-cu126-x86_64-linux/_ops.py
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/_ops.py diff above)

build/torch212-cxx11-cu126-x86_64-linux/flash_attn_interface.py
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py diff above)

build/torch212-cxx11-cu126-x86_64-linux/metadata.json
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/metadata.json diff above)
build/torch212-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so}
RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:59aea46816c69a682c3f1796df10140ae3ac7df1e8b3a742d4bf716a18c955f7
 size 1008988960
build/torch212-cxx11-cu130-x86_64-linux/_ops.py
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/_ops.py diff above)

build/torch212-cxx11-cu130-x86_64-linux/flash_attn_interface.py
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py diff above)

build/torch212-cxx11-cu130-x86_64-linux/metadata.json
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/metadata.json diff above)
build/torch212-cxx11-cu132-x86_64-linux/{_flash_attn2_cuda_85c21a0.abi3.so → _flash_attn2_cuda_042c80b.abi3.so}
RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:66999c12ec5a807e9160409ac09ad056f01ee2b60217df2088b5dc13bcb4cbf4
 size 1026378680
build/torch212-cxx11-cu132-x86_64-linux/_ops.py
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/_ops.py diff above)

build/torch212-cxx11-cu132-x86_64-linux/flash_attn_interface.py
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py diff above)

build/torch212-cxx11-cu132-x86_64-linux/metadata.json
CHANGED (identical to the torch210-cxx11-cu126-x86_64-linux/metadata.json diff above)