[general]
name = "layer_norm"
universal = false
[torch]
src = [
"torch-ext/torch_binding.cpp",
"torch-ext/torch_binding.h",
]
[kernel.layer_norm]
depends = ["torch"]
backend = "cuda"
cuda-capabilities = [
"8.0",
"8.9",
"9.0",
"10.0",
"12.0",
]
include = ["."]
src = [
"layer_norm/ln.h",
"layer_norm/ln_api.cpp",
"layer_norm/ln_bwd_1024.cu",
"layer_norm/ln_bwd_1280.cu",
"layer_norm/ln_bwd_1536.cu",
"layer_norm/ln_bwd_2048.cu",
"layer_norm/ln_bwd_256.cu",
"layer_norm/ln_bwd_2560.cu",
"layer_norm/ln_bwd_3072.cu",
"layer_norm/ln_bwd_4096.cu",
"layer_norm/ln_bwd_512.cu",
"layer_norm/ln_bwd_5120.cu",
"layer_norm/ln_bwd_6144.cu",
"layer_norm/ln_bwd_7168.cu",
"layer_norm/ln_bwd_768.cu",
"layer_norm/ln_bwd_8192.cu",
"layer_norm/ln_bwd_kernels.cuh",
"layer_norm/ln_fwd_1024.cu",
"layer_norm/ln_fwd_1280.cu",
"layer_norm/ln_fwd_1536.cu",
"layer_norm/ln_fwd_2048.cu",
"layer_norm/ln_fwd_256.cu",
"layer_norm/ln_fwd_2560.cu",
"layer_norm/ln_fwd_3072.cu",
"layer_norm/ln_fwd_4096.cu",
"layer_norm/ln_fwd_512.cu",
"layer_norm/ln_fwd_5120.cu",
"layer_norm/ln_fwd_6144.cu",
"layer_norm/ln_fwd_7168.cu",
"layer_norm/ln_fwd_768.cu",
"layer_norm/ln_fwd_8192.cu",
"layer_norm/ln_fwd_kernels.cuh",
"layer_norm/ln_kernel_traits.h",
"layer_norm/ln_parallel_bwd_1024.cu",
"layer_norm/ln_parallel_bwd_1280.cu",
"layer_norm/ln_parallel_bwd_1536.cu",
"layer_norm/ln_parallel_bwd_2048.cu",
"layer_norm/ln_parallel_bwd_256.cu",
"layer_norm/ln_parallel_bwd_2560.cu",
"layer_norm/ln_parallel_bwd_3072.cu",
"layer_norm/ln_parallel_bwd_4096.cu",
"layer_norm/ln_parallel_bwd_512.cu",
"layer_norm/ln_parallel_bwd_5120.cu",
"layer_norm/ln_parallel_bwd_6144.cu",
"layer_norm/ln_parallel_bwd_7168.cu",
"layer_norm/ln_parallel_bwd_768.cu",
"layer_norm/ln_parallel_bwd_8192.cu",
"layer_norm/ln_parallel_fwd_1024.cu",
"layer_norm/ln_parallel_fwd_1280.cu",
"layer_norm/ln_parallel_fwd_1536.cu",
"layer_norm/ln_parallel_fwd_2048.cu",
"layer_norm/ln_parallel_fwd_256.cu",
"layer_norm/ln_parallel_fwd_2560.cu",
"layer_norm/ln_parallel_fwd_3072.cu",
"layer_norm/ln_parallel_fwd_4096.cu",
"layer_norm/ln_parallel_fwd_512.cu",
"layer_norm/ln_parallel_fwd_5120.cu",
"layer_norm/ln_parallel_fwd_6144.cu",
"layer_norm/ln_parallel_fwd_7168.cu",
"layer_norm/ln_parallel_fwd_768.cu",
"layer_norm/ln_parallel_fwd_8192.cu",
"layer_norm/ln_parallel_residual_bwd_kernels.cuh",
"layer_norm/ln_parallel_residual_fwd_kernels.cuh",
"layer_norm/ln_utils.cuh",
"layer_norm/static_switch.h"
]
cxx-flags = ["-DFLASHATTENTION_DISABLE_PYBIND", "-mcmodel=large"]
cuda-flags = [
"-O3",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT16_OPERATORS__",
"-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT162_OPERATORS__",
"-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
"--expt-relaxed-constexpr",
"--expt-extended-lambda",
"--use_fast_math",
]