diff --git a/build.toml b/build.toml index f635b1cb8fdf3eea901dcede1a2764f0dff4ff1a..2c8a3bdab086d0953941fca59e130a61e8955f8b 100644 --- a/build.toml +++ b/build.toml @@ -1,5 +1,5 @@ [general] -name = "layer-norm" +name = "layer_norm" universal = false [torch] @@ -8,76 +8,76 @@ src = [ "torch-ext/torch_binding.h", ] -[kernel.layer-norm] +[kernel.layer_norm] depends = ["torch"] backend = "cuda" include = ["."] src = [ - "layer-norm/ln.h", - "layer-norm/ln_api.cpp", - "layer-norm/ln_bwd_1024.cu", - "layer-norm/ln_bwd_1280.cu", - "layer-norm/ln_bwd_1536.cu", - "layer-norm/ln_bwd_2048.cu", - "layer-norm/ln_bwd_256.cu", - "layer-norm/ln_bwd_2560.cu", - "layer-norm/ln_bwd_3072.cu", - "layer-norm/ln_bwd_4096.cu", - "layer-norm/ln_bwd_512.cu", - "layer-norm/ln_bwd_5120.cu", - "layer-norm/ln_bwd_6144.cu", - "layer-norm/ln_bwd_7168.cu", - "layer-norm/ln_bwd_768.cu", - "layer-norm/ln_bwd_8192.cu", - "layer-norm/ln_bwd_kernels.cuh", - "layer-norm/ln_fwd_1024.cu", - "layer-norm/ln_fwd_1280.cu", - "layer-norm/ln_fwd_1536.cu", - "layer-norm/ln_fwd_2048.cu", - "layer-norm/ln_fwd_256.cu", - "layer-norm/ln_fwd_2560.cu", - "layer-norm/ln_fwd_3072.cu", - "layer-norm/ln_fwd_4096.cu", - "layer-norm/ln_fwd_512.cu", - "layer-norm/ln_fwd_5120.cu", - "layer-norm/ln_fwd_6144.cu", - "layer-norm/ln_fwd_7168.cu", - "layer-norm/ln_fwd_768.cu", - "layer-norm/ln_fwd_8192.cu", - "layer-norm/ln_fwd_kernels.cuh", - "layer-norm/ln_kernel_traits.h", - "layer-norm/ln_parallel_bwd_1024.cu", - "layer-norm/ln_parallel_bwd_1280.cu", - "layer-norm/ln_parallel_bwd_1536.cu", - "layer-norm/ln_parallel_bwd_2048.cu", - "layer-norm/ln_parallel_bwd_256.cu", - "layer-norm/ln_parallel_bwd_2560.cu", - "layer-norm/ln_parallel_bwd_3072.cu", - "layer-norm/ln_parallel_bwd_4096.cu", - "layer-norm/ln_parallel_bwd_512.cu", - "layer-norm/ln_parallel_bwd_5120.cu", - "layer-norm/ln_parallel_bwd_6144.cu", - "layer-norm/ln_parallel_bwd_7168.cu", - "layer-norm/ln_parallel_bwd_768.cu", - "layer-norm/ln_parallel_bwd_8192.cu", - "layer-norm/ln_parallel_fwd_1024.cu", - "layer-norm/ln_parallel_fwd_1280.cu", - "layer-norm/ln_parallel_fwd_1536.cu", - "layer-norm/ln_parallel_fwd_2048.cu", - "layer-norm/ln_parallel_fwd_256.cu", - "layer-norm/ln_parallel_fwd_2560.cu", - "layer-norm/ln_parallel_fwd_3072.cu", - "layer-norm/ln_parallel_fwd_4096.cu", - "layer-norm/ln_parallel_fwd_512.cu", - "layer-norm/ln_parallel_fwd_5120.cu", - "layer-norm/ln_parallel_fwd_6144.cu", - "layer-norm/ln_parallel_fwd_7168.cu", - "layer-norm/ln_parallel_fwd_768.cu", - "layer-norm/ln_parallel_fwd_8192.cu", - "layer-norm/ln_parallel_residual_bwd_kernels.cuh", - "layer-norm/ln_parallel_residual_fwd_kernels.cuh", - "layer-norm/ln_utils.cuh", - "layer-norm/static_switch.h" + "layer_norm/ln.h", + "layer_norm/ln_api.cpp", + "layer_norm/ln_bwd_1024.cu", + "layer_norm/ln_bwd_1280.cu", + "layer_norm/ln_bwd_1536.cu", + "layer_norm/ln_bwd_2048.cu", + "layer_norm/ln_bwd_256.cu", + "layer_norm/ln_bwd_2560.cu", + "layer_norm/ln_bwd_3072.cu", + "layer_norm/ln_bwd_4096.cu", + "layer_norm/ln_bwd_512.cu", + "layer_norm/ln_bwd_5120.cu", + "layer_norm/ln_bwd_6144.cu", + "layer_norm/ln_bwd_7168.cu", + "layer_norm/ln_bwd_768.cu", + "layer_norm/ln_bwd_8192.cu", + "layer_norm/ln_bwd_kernels.cuh", + "layer_norm/ln_fwd_1024.cu", + "layer_norm/ln_fwd_1280.cu", + "layer_norm/ln_fwd_1536.cu", + "layer_norm/ln_fwd_2048.cu", + "layer_norm/ln_fwd_256.cu", + "layer_norm/ln_fwd_2560.cu", + "layer_norm/ln_fwd_3072.cu", + "layer_norm/ln_fwd_4096.cu", + "layer_norm/ln_fwd_512.cu", + "layer_norm/ln_fwd_5120.cu", + "layer_norm/ln_fwd_6144.cu", + "layer_norm/ln_fwd_7168.cu", + "layer_norm/ln_fwd_768.cu", + "layer_norm/ln_fwd_8192.cu", + "layer_norm/ln_fwd_kernels.cuh", + "layer_norm/ln_kernel_traits.h", + "layer_norm/ln_parallel_bwd_1024.cu", + "layer_norm/ln_parallel_bwd_1280.cu", + "layer_norm/ln_parallel_bwd_1536.cu", + "layer_norm/ln_parallel_bwd_2048.cu", + "layer_norm/ln_parallel_bwd_256.cu", + "layer_norm/ln_parallel_bwd_2560.cu", + "layer_norm/ln_parallel_bwd_3072.cu", + "layer_norm/ln_parallel_bwd_4096.cu", + "layer_norm/ln_parallel_bwd_512.cu", + "layer_norm/ln_parallel_bwd_5120.cu", + "layer_norm/ln_parallel_bwd_6144.cu", + "layer_norm/ln_parallel_bwd_7168.cu", + "layer_norm/ln_parallel_bwd_768.cu", + "layer_norm/ln_parallel_bwd_8192.cu", + "layer_norm/ln_parallel_fwd_1024.cu", + "layer_norm/ln_parallel_fwd_1280.cu", + "layer_norm/ln_parallel_fwd_1536.cu", + "layer_norm/ln_parallel_fwd_2048.cu", + "layer_norm/ln_parallel_fwd_256.cu", + "layer_norm/ln_parallel_fwd_2560.cu", + "layer_norm/ln_parallel_fwd_3072.cu", + "layer_norm/ln_parallel_fwd_4096.cu", + "layer_norm/ln_parallel_fwd_512.cu", + "layer_norm/ln_parallel_fwd_5120.cu", + "layer_norm/ln_parallel_fwd_6144.cu", + "layer_norm/ln_parallel_fwd_7168.cu", + "layer_norm/ln_parallel_fwd_768.cu", + "layer_norm/ln_parallel_fwd_8192.cu", + "layer_norm/ln_parallel_residual_bwd_kernels.cuh", + "layer_norm/ln_parallel_residual_fwd_kernels.cuh", + "layer_norm/ln_utils.cuh", + "layer_norm/static_switch.h" ] cxx-flags = ["-DFLASHATTENTION_DISABLE_PYBIND"] cuda-flags = [ diff --git a/layer-norm/ln.h b/layer_norm/ln.h similarity index 100% rename from layer-norm/ln.h rename to layer_norm/ln.h diff --git a/layer-norm/ln_api.cpp b/layer_norm/ln_api.cpp similarity index 100% rename from layer-norm/ln_api.cpp rename to layer_norm/ln_api.cpp diff --git a/layer-norm/ln_bwd_1024.cu b/layer_norm/ln_bwd_1024.cu similarity index 100% rename from layer-norm/ln_bwd_1024.cu rename to layer_norm/ln_bwd_1024.cu diff --git a/layer-norm/ln_bwd_1280.cu b/layer_norm/ln_bwd_1280.cu similarity index 100% rename from layer-norm/ln_bwd_1280.cu rename to layer_norm/ln_bwd_1280.cu diff --git a/layer-norm/ln_bwd_1536.cu b/layer_norm/ln_bwd_1536.cu similarity index 100% rename from layer-norm/ln_bwd_1536.cu rename to layer_norm/ln_bwd_1536.cu diff --git a/layer-norm/ln_bwd_2048.cu b/layer_norm/ln_bwd_2048.cu similarity index 100% rename from layer-norm/ln_bwd_2048.cu rename to layer_norm/ln_bwd_2048.cu diff --git a/layer-norm/ln_bwd_256.cu b/layer_norm/ln_bwd_256.cu similarity index 100% rename from layer-norm/ln_bwd_256.cu rename to layer_norm/ln_bwd_256.cu diff --git a/layer-norm/ln_bwd_2560.cu b/layer_norm/ln_bwd_2560.cu similarity index 100% rename from layer-norm/ln_bwd_2560.cu rename to layer_norm/ln_bwd_2560.cu diff --git a/layer-norm/ln_bwd_3072.cu b/layer_norm/ln_bwd_3072.cu similarity index 100% rename from layer-norm/ln_bwd_3072.cu rename to layer_norm/ln_bwd_3072.cu diff --git a/layer-norm/ln_bwd_4096.cu b/layer_norm/ln_bwd_4096.cu similarity index 100% rename from layer-norm/ln_bwd_4096.cu rename to layer_norm/ln_bwd_4096.cu diff --git a/layer-norm/ln_bwd_512.cu b/layer_norm/ln_bwd_512.cu similarity index 100% rename from layer-norm/ln_bwd_512.cu rename to layer_norm/ln_bwd_512.cu diff --git a/layer-norm/ln_bwd_5120.cu b/layer_norm/ln_bwd_5120.cu similarity index 100% rename from layer-norm/ln_bwd_5120.cu rename to layer_norm/ln_bwd_5120.cu diff --git a/layer-norm/ln_bwd_6144.cu b/layer_norm/ln_bwd_6144.cu similarity index 100% rename from layer-norm/ln_bwd_6144.cu rename to layer_norm/ln_bwd_6144.cu diff --git a/layer-norm/ln_bwd_7168.cu b/layer_norm/ln_bwd_7168.cu similarity index 100% rename from layer-norm/ln_bwd_7168.cu rename to layer_norm/ln_bwd_7168.cu diff --git a/layer-norm/ln_bwd_768.cu b/layer_norm/ln_bwd_768.cu similarity index 100% rename from layer-norm/ln_bwd_768.cu rename to layer_norm/ln_bwd_768.cu diff --git a/layer-norm/ln_bwd_8192.cu b/layer_norm/ln_bwd_8192.cu similarity index 100% rename from layer-norm/ln_bwd_8192.cu rename to layer_norm/ln_bwd_8192.cu diff --git a/layer-norm/ln_bwd_kernels.cuh b/layer_norm/ln_bwd_kernels.cuh similarity index 100% rename from layer-norm/ln_bwd_kernels.cuh rename to layer_norm/ln_bwd_kernels.cuh diff --git a/layer-norm/ln_fwd_1024.cu b/layer_norm/ln_fwd_1024.cu similarity index 100% rename from layer-norm/ln_fwd_1024.cu rename to layer_norm/ln_fwd_1024.cu diff --git a/layer-norm/ln_fwd_1280.cu b/layer_norm/ln_fwd_1280.cu similarity index 100% rename from layer-norm/ln_fwd_1280.cu rename to layer_norm/ln_fwd_1280.cu diff --git a/layer-norm/ln_fwd_1536.cu b/layer_norm/ln_fwd_1536.cu similarity index 100% rename from layer-norm/ln_fwd_1536.cu rename to layer_norm/ln_fwd_1536.cu diff --git a/layer-norm/ln_fwd_2048.cu b/layer_norm/ln_fwd_2048.cu similarity index 100% rename from layer-norm/ln_fwd_2048.cu rename to layer_norm/ln_fwd_2048.cu diff --git a/layer-norm/ln_fwd_256.cu b/layer_norm/ln_fwd_256.cu similarity index 100% rename from layer-norm/ln_fwd_256.cu rename to layer_norm/ln_fwd_256.cu diff --git a/layer-norm/ln_fwd_2560.cu b/layer_norm/ln_fwd_2560.cu similarity index 100% rename from layer-norm/ln_fwd_2560.cu rename to layer_norm/ln_fwd_2560.cu diff --git a/layer-norm/ln_fwd_3072.cu b/layer_norm/ln_fwd_3072.cu similarity index 100% rename from layer-norm/ln_fwd_3072.cu rename to layer_norm/ln_fwd_3072.cu diff --git a/layer-norm/ln_fwd_4096.cu b/layer_norm/ln_fwd_4096.cu similarity index 100% rename from layer-norm/ln_fwd_4096.cu rename to layer_norm/ln_fwd_4096.cu diff --git a/layer-norm/ln_fwd_512.cu b/layer_norm/ln_fwd_512.cu similarity index 100% rename from layer-norm/ln_fwd_512.cu rename to layer_norm/ln_fwd_512.cu diff --git a/layer-norm/ln_fwd_5120.cu b/layer_norm/ln_fwd_5120.cu similarity index 100% rename from layer-norm/ln_fwd_5120.cu rename to layer_norm/ln_fwd_5120.cu diff --git a/layer-norm/ln_fwd_6144.cu b/layer_norm/ln_fwd_6144.cu similarity index 100% rename from layer-norm/ln_fwd_6144.cu rename to layer_norm/ln_fwd_6144.cu diff --git a/layer-norm/ln_fwd_7168.cu b/layer_norm/ln_fwd_7168.cu similarity index 100% rename from layer-norm/ln_fwd_7168.cu rename to layer_norm/ln_fwd_7168.cu diff --git a/layer-norm/ln_fwd_768.cu b/layer_norm/ln_fwd_768.cu similarity index 100% rename from layer-norm/ln_fwd_768.cu rename to layer_norm/ln_fwd_768.cu diff --git a/layer-norm/ln_fwd_8192.cu b/layer_norm/ln_fwd_8192.cu similarity index 100% rename from layer-norm/ln_fwd_8192.cu rename to layer_norm/ln_fwd_8192.cu diff --git a/layer-norm/ln_fwd_kernels.cuh b/layer_norm/ln_fwd_kernels.cuh similarity index 100% rename from layer-norm/ln_fwd_kernels.cuh rename to layer_norm/ln_fwd_kernels.cuh diff --git a/layer-norm/ln_kernel_traits.h b/layer_norm/ln_kernel_traits.h similarity index 100% rename from layer-norm/ln_kernel_traits.h rename to layer_norm/ln_kernel_traits.h diff --git a/layer-norm/ln_parallel_bwd_1024.cu b/layer_norm/ln_parallel_bwd_1024.cu similarity index 100% rename from layer-norm/ln_parallel_bwd_1024.cu rename to layer_norm/ln_parallel_bwd_1024.cu diff --git a/layer-norm/ln_parallel_bwd_1280.cu b/layer_norm/ln_parallel_bwd_1280.cu similarity index 100% rename from layer-norm/ln_parallel_bwd_1280.cu rename to layer_norm/ln_parallel_bwd_1280.cu diff --git a/layer-norm/ln_parallel_bwd_1536.cu b/layer_norm/ln_parallel_bwd_1536.cu similarity index 100% rename from layer-norm/ln_parallel_bwd_1536.cu rename to layer_norm/ln_parallel_bwd_1536.cu diff --git a/layer-norm/ln_parallel_bwd_2048.cu b/layer_norm/ln_parallel_bwd_2048.cu similarity index 100% rename from layer-norm/ln_parallel_bwd_2048.cu rename to layer_norm/ln_parallel_bwd_2048.cu diff --git a/layer-norm/ln_parallel_bwd_256.cu b/layer_norm/ln_parallel_bwd_256.cu similarity index 100% rename from layer-norm/ln_parallel_bwd_256.cu rename to layer_norm/ln_parallel_bwd_256.cu diff --git a/layer-norm/ln_parallel_bwd_2560.cu b/layer_norm/ln_parallel_bwd_2560.cu similarity index 100% rename from layer-norm/ln_parallel_bwd_2560.cu rename to layer_norm/ln_parallel_bwd_2560.cu diff --git a/layer-norm/ln_parallel_bwd_3072.cu b/layer_norm/ln_parallel_bwd_3072.cu similarity index 100% rename from layer-norm/ln_parallel_bwd_3072.cu rename to layer_norm/ln_parallel_bwd_3072.cu diff --git a/layer-norm/ln_parallel_bwd_4096.cu b/layer_norm/ln_parallel_bwd_4096.cu similarity index 100% rename from layer-norm/ln_parallel_bwd_4096.cu rename to layer_norm/ln_parallel_bwd_4096.cu diff --git a/layer-norm/ln_parallel_bwd_512.cu b/layer_norm/ln_parallel_bwd_512.cu similarity index 100% rename from layer-norm/ln_parallel_bwd_512.cu rename to layer_norm/ln_parallel_bwd_512.cu diff --git a/layer-norm/ln_parallel_bwd_5120.cu b/layer_norm/ln_parallel_bwd_5120.cu similarity index 100% rename from layer-norm/ln_parallel_bwd_5120.cu rename to layer_norm/ln_parallel_bwd_5120.cu diff --git a/layer-norm/ln_parallel_bwd_6144.cu b/layer_norm/ln_parallel_bwd_6144.cu similarity index 100% rename from layer-norm/ln_parallel_bwd_6144.cu rename to layer_norm/ln_parallel_bwd_6144.cu diff --git a/layer-norm/ln_parallel_bwd_7168.cu b/layer_norm/ln_parallel_bwd_7168.cu similarity index 100% rename from layer-norm/ln_parallel_bwd_7168.cu rename to layer_norm/ln_parallel_bwd_7168.cu diff --git a/layer-norm/ln_parallel_bwd_768.cu b/layer_norm/ln_parallel_bwd_768.cu similarity index 100% rename from layer-norm/ln_parallel_bwd_768.cu rename to layer_norm/ln_parallel_bwd_768.cu diff --git a/layer-norm/ln_parallel_bwd_8192.cu b/layer_norm/ln_parallel_bwd_8192.cu similarity index 100% rename from layer-norm/ln_parallel_bwd_8192.cu rename to layer_norm/ln_parallel_bwd_8192.cu diff --git a/layer-norm/ln_parallel_fwd_1024.cu b/layer_norm/ln_parallel_fwd_1024.cu similarity index 100% rename from layer-norm/ln_parallel_fwd_1024.cu rename to layer_norm/ln_parallel_fwd_1024.cu diff --git a/layer-norm/ln_parallel_fwd_1280.cu b/layer_norm/ln_parallel_fwd_1280.cu similarity index 100% rename from layer-norm/ln_parallel_fwd_1280.cu rename to layer_norm/ln_parallel_fwd_1280.cu diff --git a/layer-norm/ln_parallel_fwd_1536.cu b/layer_norm/ln_parallel_fwd_1536.cu similarity index 100% rename from layer-norm/ln_parallel_fwd_1536.cu rename to layer_norm/ln_parallel_fwd_1536.cu diff --git a/layer-norm/ln_parallel_fwd_2048.cu b/layer_norm/ln_parallel_fwd_2048.cu similarity index 100% rename from layer-norm/ln_parallel_fwd_2048.cu rename to layer_norm/ln_parallel_fwd_2048.cu diff --git a/layer-norm/ln_parallel_fwd_256.cu b/layer_norm/ln_parallel_fwd_256.cu similarity index 100% rename from layer-norm/ln_parallel_fwd_256.cu rename to layer_norm/ln_parallel_fwd_256.cu diff --git a/layer-norm/ln_parallel_fwd_2560.cu b/layer_norm/ln_parallel_fwd_2560.cu similarity index 100% rename from layer-norm/ln_parallel_fwd_2560.cu rename to layer_norm/ln_parallel_fwd_2560.cu diff --git a/layer-norm/ln_parallel_fwd_3072.cu b/layer_norm/ln_parallel_fwd_3072.cu similarity index 100% rename from layer-norm/ln_parallel_fwd_3072.cu rename to layer_norm/ln_parallel_fwd_3072.cu diff --git a/layer-norm/ln_parallel_fwd_4096.cu b/layer_norm/ln_parallel_fwd_4096.cu similarity index 100% rename from layer-norm/ln_parallel_fwd_4096.cu rename to layer_norm/ln_parallel_fwd_4096.cu diff --git a/layer-norm/ln_parallel_fwd_512.cu b/layer_norm/ln_parallel_fwd_512.cu similarity index 100% rename from layer-norm/ln_parallel_fwd_512.cu rename to layer_norm/ln_parallel_fwd_512.cu diff --git a/layer-norm/ln_parallel_fwd_5120.cu b/layer_norm/ln_parallel_fwd_5120.cu similarity index 100% rename from layer-norm/ln_parallel_fwd_5120.cu rename to layer_norm/ln_parallel_fwd_5120.cu diff --git a/layer-norm/ln_parallel_fwd_6144.cu b/layer_norm/ln_parallel_fwd_6144.cu similarity index 100% rename from layer-norm/ln_parallel_fwd_6144.cu rename to layer_norm/ln_parallel_fwd_6144.cu diff --git a/layer-norm/ln_parallel_fwd_7168.cu b/layer_norm/ln_parallel_fwd_7168.cu similarity index 100% rename from layer-norm/ln_parallel_fwd_7168.cu rename to layer_norm/ln_parallel_fwd_7168.cu diff --git a/layer-norm/ln_parallel_fwd_768.cu b/layer_norm/ln_parallel_fwd_768.cu similarity index 100% rename from layer-norm/ln_parallel_fwd_768.cu rename to layer_norm/ln_parallel_fwd_768.cu diff --git a/layer-norm/ln_parallel_fwd_8192.cu b/layer_norm/ln_parallel_fwd_8192.cu similarity index 100% rename from layer-norm/ln_parallel_fwd_8192.cu rename to layer_norm/ln_parallel_fwd_8192.cu diff --git a/layer-norm/ln_parallel_residual_bwd_kernels.cuh b/layer_norm/ln_parallel_residual_bwd_kernels.cuh similarity index 100% rename from layer-norm/ln_parallel_residual_bwd_kernels.cuh rename to layer_norm/ln_parallel_residual_bwd_kernels.cuh diff --git a/layer-norm/ln_parallel_residual_fwd_kernels.cuh b/layer_norm/ln_parallel_residual_fwd_kernels.cuh similarity index 100% rename from layer-norm/ln_parallel_residual_fwd_kernels.cuh rename to layer_norm/ln_parallel_residual_fwd_kernels.cuh diff --git a/layer-norm/ln_utils.cuh b/layer_norm/ln_utils.cuh similarity index 100% rename from layer-norm/ln_utils.cuh rename to layer_norm/ln_utils.cuh diff --git a/layer-norm/static_switch.h b/layer_norm/static_switch.h similarity index 100% rename from layer-norm/static_switch.h rename to layer_norm/static_switch.h diff --git a/setup.py b/setup_backup.py similarity index 100% rename from setup.py rename to setup_backup.py diff --git a/torch-ext/layer-norm/__init__.py b/torch-ext/layer_norm/__init__.py similarity index 100% rename from torch-ext/layer-norm/__init__.py rename to torch-ext/layer_norm/__init__.py diff --git a/torch-ext/layer-norm/layers.py b/torch-ext/layer_norm/layers.py similarity index 100% rename from torch-ext/layer-norm/layers.py rename to torch-ext/layer_norm/layers.py