Update build.toml
Browse files
- build.toml +5 -9
build.toml
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
[general]
|
| 2 |
name = "vllm_flash_attn3"
|
| 3 |
universal = false
|
| 4 |
-
cuda-minver = "12.
|
| 5 |
-
cuda-maxver = "12.
|
| 6 |
|
| 7 |
[torch]
|
| 8 |
src = [
|
|
@@ -18,8 +18,6 @@ cuda-flags = [
|
|
| 18 |
"-O3",
|
| 19 |
"-std=c++17",
|
| 20 |
"--ftemplate-backtrace-limit=0", # To debug template code
|
| 21 |
-
"--use_fast_math",
|
| 22 |
-
"-DCUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1"
|
| 23 |
"--expt-relaxed-constexpr",
|
| 24 |
"--expt-extended-lambda",
|
| 25 |
"--use_fast_math",
|
|
@@ -44,17 +42,15 @@ depends = ["torch", "cutlass_3_9"]
|
|
| 44 |
|
| 45 |
[kernel.flash_attn_sm80]
|
| 46 |
backend = "cuda"
|
| 47 |
-
+cuda-capabilities = ["8.0", "8.6"
|
| 48 |
cuda-flags = [
|
| 49 |
"-O3",
|
| 50 |
"-std=c++17",
|
| 51 |
"--ftemplate-backtrace-limit=0", # To debug template code
|
| 52 |
-
"--use_fast_math",
|
| 53 |
-
"-DCUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1",
|
| 54 |
"--expt-relaxed-constexpr",
|
| 55 |
"--expt-extended-lambda",
|
| 56 |
-
"--use_fast_math",
|
| 57 |
"-DNDEBUG",
|
|
|
|
| 58 |
]
|
| 59 |
src = [
|
| 60 |
"flash-attn/block.h",
|
|
@@ -191,7 +187,7 @@ depends = ["torch", "cutlass_3_9"]
|
|
| 191 |
|
| 192 |
[kernel.flash_attn_sm90]
|
| 193 |
backend = "cuda"
|
| 194 |
-
cuda-capabilities = ["
|
| 195 |
cuda-flags = [
|
| 196 |
"-O3",
|
| 197 |
"-std=c++17",
|
|
|
|
| 1 |
[general]
|
| 2 |
name = "vllm_flash_attn3"
|
| 3 |
universal = false
|
| 4 |
+
cuda-minver = "12.0"
|
| 5 |
+
cuda-maxver = "12.8"
|
| 6 |
|
| 7 |
[torch]
|
| 8 |
src = [
|
|
|
|
| 18 |
"-O3",
|
| 19 |
"-std=c++17",
|
| 20 |
"--ftemplate-backtrace-limit=0", # To debug template code
|
|
|
|
|
|
|
| 21 |
"--expt-relaxed-constexpr",
|
| 22 |
"--expt-extended-lambda",
|
| 23 |
"--use_fast_math",
|
|
|
|
| 42 |
|
| 43 |
[kernel.flash_attn_sm80]
|
| 44 |
backend = "cuda"
|
| 45 |
+
cuda-capabilities = ["8.0", "8.6"]
|
| 46 |
cuda-flags = [
|
| 47 |
"-O3",
|
| 48 |
"-std=c++17",
|
| 49 |
"--ftemplate-backtrace-limit=0", # To debug template code
|
|
|
|
|
|
|
| 50 |
"--expt-relaxed-constexpr",
|
| 51 |
"--expt-extended-lambda",
|
|
|
|
| 52 |
"-DNDEBUG",
|
| 53 |
+
"-DFLASHATTENTION_DISABLE_FP8"
|
| 54 |
]
|
| 55 |
src = [
|
| 56 |
"flash-attn/block.h",
|
|
|
|
| 187 |
|
| 188 |
[kernel.flash_attn_sm90]
|
| 189 |
backend = "cuda"
|
| 190 |
+
cuda-capabilities = ["9.0a"]
|
| 191 |
cuda-flags = [
|
| 192 |
"-O3",
|
| 193 |
"-std=c++17",
|