[general]
name = "sage_attention"
backends = ["cuda"]


[torch]
src = [
  "torch-ext/torch_binding.cpp",
  "torch-ext/torch_binding.h",
]


[kernel._qattn_sm89]
backend = "cuda"
cuda-capabilities = ["8.9"]
cuda-flags = [
  "-O3",
  "-std=c++17",
  "-U__CUDA_NO_HALF_OPERATORS__",
  "-U__CUDA_NO_HALF_CONVERSIONS__",
  "--use_fast_math",
  "--threads=1",
  "-Xptxas=-v",
  "-diag-suppress=174",
]
cuda-minver = "12.6"
cxx-flags = [
  "-g",
  "-O3",
  "-fopenmp",
  "-lgomp",
  "-std=c++17",
  "-DENABLE_BF16",
]
depends = ["torch"]
include = ["."]
src = [
  "sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu",
  "sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu",
  "sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu",
  "sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu",
  "sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu",
  "sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu",
  "sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu",
  "sage_attention/qattn/attn_cuda_sm89.h",
  "sage_attention/qattn/qk_int_sv_f8_cuda_sm89.cuh",
  "sage_attention/qattn/attn_utils.cuh",
]


[kernel._qattn_sm90]
backend = "cuda"
cuda-capabilities = ["9.0a"]
cuda-flags = [
  "-O3",
  "-std=c++17",
  "-U__CUDA_NO_HALF_OPERATORS__",
  "-U__CUDA_NO_HALF_CONVERSIONS__",
  "--use_fast_math",
  "--threads=1",
  "-Xptxas=-v",
  "-diag-suppress=174",
]
cuda-minver = "12.6"
cxx-flags = [
  "-g",
  "-O3",
  "-fopenmp",
  "-lgomp",
  "-std=c++17",
  "-DENABLE_BF16",
]
depends = ["torch"]
include = ["."]
src = [
  "sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu",
  "sage_attention/qattn/attn_cuda_sm90.h",
  "sage_attention/qattn/attn_utils.cuh",
]


[kernel._qattn]
backend = "cuda"
cuda-capabilities = [
  "8.0",
  "8.9",
  "9.0a",
]
cuda-flags = [
  "-O3",
  "-std=c++17",
  "-U__CUDA_NO_HALF_OPERATORS__",
  "-U__CUDA_NO_HALF_CONVERSIONS__",
  "--use_fast_math",
  "--threads=1",
  "-Xptxas=-v",
  "-diag-suppress=174",
]
cuda-minver = "12.6"
cxx-flags = [
  "-g",
  "-O3",
  "-fopenmp",
  "-lgomp",
  "-std=c++17",
  "-DENABLE_BF16",
]
depends = ["torch"]
src = [
  "sage_attention/cp_async.cuh",
  "sage_attention/dispatch_utils.h",
  "sage_attention/math.cuh",
  "sage_attention/mma.cuh",
  "sage_attention/numeric_conversion.cuh",
  "sage_attention/permuted_smem.cuh",
  "sage_attention/reduction_utils.cuh",
  "sage_attention/wgmma.cuh",
  "sage_attention/utils.cuh",
  "sage_attention/cuda_tensormap_shim.cuh",
]


[kernel._qattn_sm80]
backend = "cuda"
cuda-capabilities = ["8.0"]
cuda-flags = [
  "-O3",
  "-std=c++17",
  "-U__CUDA_NO_HALF_OPERATORS__",
  "-U__CUDA_NO_HALF_CONVERSIONS__",
  "--use_fast_math",
  "--threads=1",
  "-Xptxas=-v",
  "-diag-suppress=174",
]
cuda-minver = "12.6"
cxx-flags = [
  "-g",
  "-O3",
  "-fopenmp",
  "-lgomp",
  "-std=c++17",
  "-DENABLE_BF16",
]
depends = ["torch"]
include = ["."]
src = [
  "sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu",
  "sage_attention/qattn/attn_cuda_sm80.h",
  "sage_attention/qattn/attn_utils.cuh",
]


[kernel._fused]
backend = "cuda"
cuda-capabilities = [
  "8.0",
  "8.9",
  "9.0a",
]
cuda-flags = [
  "-O3",
  "-std=c++17",
  "-U__CUDA_NO_HALF_OPERATORS__",
  "-U__CUDA_NO_HALF_CONVERSIONS__",
  "--use_fast_math",
  "--threads=1",
  "-Xptxas=-v",
  "-diag-suppress=174",
]
cuda-minver = "12.6"
cxx-flags = [
  "-g",
  "-O3",
  "-fopenmp",
  "-lgomp",
  "-std=c++17",
  "-DENABLE_BF16",
]
depends = ["torch"]
include = ["."]
src = [
  "sage_attention/fused/fused.cu",
  "sage_attention/fused/fused.h",
]