File size: 3,141 Bytes
8b60ef7
4e9c226
8b60ef7
 
 
 
 
 
 
 
4e9c226
8b60ef7
 
f622ea1
5d4178a
 
 
 
 
f622ea1
8b60ef7
 
4e9c226
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b60ef7
f622ea1
8b60ef7
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# Package-wide settings.
[general]
# Name of the package this configuration builds.
name = "layer_norm"
# NOTE(review): presumably "universal" marks a build with no compiled code;
# it is false here, and a CUDA kernel section with .cu sources follows.
# Confirm the exact meaning against the build tool's documentation.
universal = false

# Sources for the Torch extension binding: one C++ file and its header,
# both under torch-ext/.
[torch]
src = [
  "torch-ext/torch_binding.cpp",
  "torch-ext/torch_binding.h",
]

# Build description for the "layer_norm" kernel: its dependency, backend,
# target compute capabilities, source files and compiler flags.
[kernel.layer_norm]
# This kernel depends on "torch".
depends = ["torch"]
backend = "cuda"
# CUDA compute capabilities listed for this kernel.
cuda-capabilities = [
    "8.0",
    "8.9",
    "9.0",
    "10.0",
    "12.0",
]
# NOTE(review): presumably resolved relative to the directory holding this
# file, so that headers under layer_norm/ can be included - confirm.
include = ["."]
# Headers and translation units that make up the kernel. Entries are in
# lexicographic (string) order, which is why e.g. "256" sorts after "2048".
# Each of the four families below (bwd, fwd, parallel_bwd, parallel_fwd) has
# one .cu file per numeric suffix: 256, 512, 768, 1024, 1280, 1536, 2048,
# 2560, 3072, 4096, 5120, 6144, 7168, 8192.
# NOTE(review): the suffix is presumably the hidden size each file
# specializes for - confirm against the kernel sources.
src = [
    "layer_norm/ln.h",
    "layer_norm/ln_api.cpp",
    # Backward pass, one file per size.
    "layer_norm/ln_bwd_1024.cu",
    "layer_norm/ln_bwd_1280.cu",
    "layer_norm/ln_bwd_1536.cu",
    "layer_norm/ln_bwd_2048.cu",
    "layer_norm/ln_bwd_256.cu",
    "layer_norm/ln_bwd_2560.cu",
    "layer_norm/ln_bwd_3072.cu",
    "layer_norm/ln_bwd_4096.cu",
    "layer_norm/ln_bwd_512.cu",
    "layer_norm/ln_bwd_5120.cu",
    "layer_norm/ln_bwd_6144.cu",
    "layer_norm/ln_bwd_7168.cu",
    "layer_norm/ln_bwd_768.cu",
    "layer_norm/ln_bwd_8192.cu",
    "layer_norm/ln_bwd_kernels.cuh",
    # Forward pass, one file per size.
    "layer_norm/ln_fwd_1024.cu",
    "layer_norm/ln_fwd_1280.cu",
    "layer_norm/ln_fwd_1536.cu",
    "layer_norm/ln_fwd_2048.cu",
    "layer_norm/ln_fwd_256.cu",
    "layer_norm/ln_fwd_2560.cu",
    "layer_norm/ln_fwd_3072.cu",
    "layer_norm/ln_fwd_4096.cu",
    "layer_norm/ln_fwd_512.cu",
    "layer_norm/ln_fwd_5120.cu",
    "layer_norm/ln_fwd_6144.cu",
    "layer_norm/ln_fwd_7168.cu",
    "layer_norm/ln_fwd_768.cu",
    "layer_norm/ln_fwd_8192.cu",
    "layer_norm/ln_fwd_kernels.cuh",
    "layer_norm/ln_kernel_traits.h",
    # "parallel" backward variants, one file per size.
    "layer_norm/ln_parallel_bwd_1024.cu",
    "layer_norm/ln_parallel_bwd_1280.cu",
    "layer_norm/ln_parallel_bwd_1536.cu",
    "layer_norm/ln_parallel_bwd_2048.cu",
    "layer_norm/ln_parallel_bwd_256.cu",
    "layer_norm/ln_parallel_bwd_2560.cu",
    "layer_norm/ln_parallel_bwd_3072.cu",
    "layer_norm/ln_parallel_bwd_4096.cu",
    "layer_norm/ln_parallel_bwd_512.cu",
    "layer_norm/ln_parallel_bwd_5120.cu",
    "layer_norm/ln_parallel_bwd_6144.cu",
    "layer_norm/ln_parallel_bwd_7168.cu",
    "layer_norm/ln_parallel_bwd_768.cu",
    "layer_norm/ln_parallel_bwd_8192.cu",
    # "parallel" forward variants, one file per size.
    "layer_norm/ln_parallel_fwd_1024.cu",
    "layer_norm/ln_parallel_fwd_1280.cu",
    "layer_norm/ln_parallel_fwd_1536.cu",
    "layer_norm/ln_parallel_fwd_2048.cu",
    "layer_norm/ln_parallel_fwd_256.cu",
    "layer_norm/ln_parallel_fwd_2560.cu",
    "layer_norm/ln_parallel_fwd_3072.cu",
    "layer_norm/ln_parallel_fwd_4096.cu",
    "layer_norm/ln_parallel_fwd_512.cu",
    "layer_norm/ln_parallel_fwd_5120.cu",
    "layer_norm/ln_parallel_fwd_6144.cu",
    "layer_norm/ln_parallel_fwd_7168.cu",
    "layer_norm/ln_parallel_fwd_768.cu",
    "layer_norm/ln_parallel_fwd_8192.cu",
    # Shared kernel templates and utility headers.
    "layer_norm/ln_parallel_residual_bwd_kernels.cuh",
    "layer_norm/ln_parallel_residual_fwd_kernels.cuh",
    "layer_norm/ln_utils.cuh",
    "layer_norm/static_switch.h"
]
# Host C++ compiler flags:
#   -DFLASHATTENTION_DISABLE_PYBIND  defines that preprocessor macro.
#   -mcmodel=large                   selects the compiler's large code model.
# NOTE(review): the large code model is presumably needed because the many
# per-size instantiations produce a very large binary - confirm.
cxx-flags = ["-DFLASHATTENTION_DISABLE_PYBIND", "-mcmodel=large"]
# CUDA compiler flags:
#   -O3                        highest standard optimization level.
#   -U__CUDA_NO_*              undefine the macros that would otherwise hide
#                              the half / bfloat16 / bfloat162 operators and
#                              conversions in the CUDA headers.
#   --expt-relaxed-constexpr   allow constexpr host functions in device code
#                              (and vice versa).
#   --expt-extended-lambda     allow __device__ / __host__ __device__ lambdas.
#   --use_fast_math            use fast, lower-precision math intrinsics.
cuda-flags = [
    "-O3",
    "-U__CUDA_NO_HALF_OPERATORS__",
    "-U__CUDA_NO_HALF_CONVERSIONS__",
    "-U__CUDA_NO_BFLOAT16_OPERATORS__",
    "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
    "-U__CUDA_NO_BFLOAT162_OPERATORS__",
    "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
    "--expt-relaxed-constexpr",
    "--expt-extended-lambda",
    "--use_fast_math",
]