cmake_minimum_required(VERSION 3.26)
project(layer_norm LANGUAGES CXX)

# Which accelerator backend to build the kernels for.
set(TARGET_DEVICE "cuda" CACHE STRING "Target device backend for kernel")
# Advertise the values this file actually understands (cmake-gui drop-down);
# any other value simply makes the script return early below.
set_property(CACHE TARGET_DEVICE PROPERTY STRINGS cuda rocm)

# Restrict `cmake --install` to rules from this directory tree, for every
# install component.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)

include(FetchContent)
# FetchContent defines FETCHCONTENT_BASE_DIR (defaults to <build>/_deps);
# create it eagerly so later steps can rely on it existing.
file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}")
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

# Architectures each backend can target; requested archs are filtered
# against these lists below.
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")

# Project helpers: arch handling, torch glue, extension target definition.
include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
if(DEFINED Python_EXECUTABLE)
  # Allow passing through the interpreter (e.g. from setup.py); FindPython
  # uses a pre-set Python_EXECUTABLE as its hint, so REQUIRED is omitted to
  # emit a tailored error instead.
  find_package(Python COMPONENTS Development Development.SABIModule Interpreter)
  if(NOT Python_FOUND)
    # Bug fix: the original message interpolated the undefined ${EXECUTABLE},
    # printing an empty path in the error.
    message(FATAL_ERROR "Unable to find python matching: ${Python_EXECUTABLE}.")
  endif()
else()
  find_package(Python REQUIRED COMPONENTS Development Development.SABIModule Interpreter)
endif()
# Make torch's CMake package config discoverable, then import it.
append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
find_package(Torch REQUIRED)

# Everything below is GPU-specific; bail out early for other backends.
# Comparisons are quoted so the expanded value is never re-dereferenced
# as a variable name (CMP0054 discipline).
if(NOT "${TARGET_DEVICE}" STREQUAL "cuda" AND
   NOT "${TARGET_DEVICE}" STREQUAL "rocm")
  return()
endif()
# Default architecture set used when no narrower request is made.
# The 10.x/12.0 entries are only emitted for nvcc >= 12.8, which is the
# first toolkit release that accepts them.
if(DEFINED CMAKE_CUDA_COMPILER_VERSION
   AND NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8)
  set(CUDA_DEFAULT_KERNEL_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0+PTX")
else()
  set(CUDA_DEFAULT_KERNEL_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0+PTX")
endif()
# Pick the GPU language from what importing torch detected: prefer HIP when
# present, otherwise CUDA, otherwise fail the configure step.
if(HIP_FOUND)
  set(GPU_LANG "HIP")
  # Importing torch recognizes and sets up some HIP/ROCm configuration but
  # does not let cmake recognize .hip files. Enabling HIP explicitly teaches
  # cmake the .hip extension.
  enable_language(HIP)
elseif(CUDA_FOUND)
  set(GPU_LANG "CUDA")
else()
  message(FATAL_ERROR "Can't find CUDA or HIP installation.")
endif()
if(GPU_LANG STREQUAL "CUDA")
  # Derive the target arch list from torch's arch flags, then filter it by
  # the supported archs since for some files we will build for all CUDA_ARCHS.
  clear_cuda_arches(CUDA_ARCH_FLAGS)
  extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
  cuda_archs_loose_intersection(CUDA_ARCHS "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")

  # Parallel nvcc compilation when NVCC_THREADS is set. (The original also
  # re-tested GPU_LANG here, which is redundant inside this branch.)
  if(NVCC_THREADS)
    list(APPEND GPU_FLAGS "--threads=${NVCC_THREADS}")
  endif()

  add_compile_definitions(CUDA_KERNEL)
elseif(GPU_LANG STREQUAL "HIP")
  set(ROCM_ARCHS "${HIP_SUPPORTED_ARCHS}")
  # TODO: remove this once we can set specific archs per source file set.
  override_gpu_arches(GPU_ARCHES
    ${GPU_LANG}
    "${${GPU_LANG}_SUPPORTED_ARCHS}")
  add_compile_definitions(ROCM_KERNEL)
else()
  # Defensive only: GPU_LANG is always "CUDA" or "HIP" at this point (the
  # detection above fails hard otherwise), so this branch is unreachable.
  override_gpu_arches(GPU_ARCHES
    ${GPU_LANG}
    "${${GPU_LANG}_SUPPORTED_ARCHS}")
endif()
# Fold the GPU compiler flags that torch requires into our flag list.
get_torch_gpu_compiler_flags(TORCH_GPU_FLAGS ${GPU_LANG})
list(APPEND GPU_FLAGS ${TORCH_GPU_FLAGS})

# Torch extension binding sources (registration glue for the kernels).
set(TORCH_layer_norm_SRC
    torch-ext/torch_binding.cpp
    torch-ext/torch_binding.h)
list(APPEND SRC "${TORCH_layer_norm_SRC}")
# Kernel sources. Each hidden size (256..8192) gets its own translation unit
# for the forward, backward, and "parallel residual" fwd/bwd variants;
# shared headers are listed alongside so per-source properties apply to them.
set(layer_norm_SRC
  # Shared API / dispatch.
  "layer_norm/ln.h"
  "layer_norm/ln_api.cpp"
  # Backward kernels, one per hidden size.
  "layer_norm/ln_bwd_1024.cu"
  "layer_norm/ln_bwd_1280.cu"
  "layer_norm/ln_bwd_1536.cu"
  "layer_norm/ln_bwd_2048.cu"
  "layer_norm/ln_bwd_256.cu"
  "layer_norm/ln_bwd_2560.cu"
  "layer_norm/ln_bwd_3072.cu"
  "layer_norm/ln_bwd_4096.cu"
  "layer_norm/ln_bwd_512.cu"
  "layer_norm/ln_bwd_5120.cu"
  "layer_norm/ln_bwd_6144.cu"
  "layer_norm/ln_bwd_7168.cu"
  "layer_norm/ln_bwd_768.cu"
  "layer_norm/ln_bwd_8192.cu"
  "layer_norm/ln_bwd_kernels.cuh"
  # Forward kernels, one per hidden size.
  "layer_norm/ln_fwd_1024.cu"
  "layer_norm/ln_fwd_1280.cu"
  "layer_norm/ln_fwd_1536.cu"
  "layer_norm/ln_fwd_2048.cu"
  "layer_norm/ln_fwd_256.cu"
  "layer_norm/ln_fwd_2560.cu"
  "layer_norm/ln_fwd_3072.cu"
  "layer_norm/ln_fwd_4096.cu"
  "layer_norm/ln_fwd_512.cu"
  "layer_norm/ln_fwd_5120.cu"
  "layer_norm/ln_fwd_6144.cu"
  "layer_norm/ln_fwd_7168.cu"
  "layer_norm/ln_fwd_768.cu"
  "layer_norm/ln_fwd_8192.cu"
  "layer_norm/ln_fwd_kernels.cuh"
  "layer_norm/ln_kernel_traits.h"
  # Parallel-residual backward variants.
  "layer_norm/ln_parallel_bwd_1024.cu"
  "layer_norm/ln_parallel_bwd_1280.cu"
  "layer_norm/ln_parallel_bwd_1536.cu"
  "layer_norm/ln_parallel_bwd_2048.cu"
  "layer_norm/ln_parallel_bwd_256.cu"
  "layer_norm/ln_parallel_bwd_2560.cu"
  "layer_norm/ln_parallel_bwd_3072.cu"
  "layer_norm/ln_parallel_bwd_4096.cu"
  "layer_norm/ln_parallel_bwd_512.cu"
  "layer_norm/ln_parallel_bwd_5120.cu"
  "layer_norm/ln_parallel_bwd_6144.cu"
  "layer_norm/ln_parallel_bwd_7168.cu"
  "layer_norm/ln_parallel_bwd_768.cu"
  "layer_norm/ln_parallel_bwd_8192.cu"
  # Parallel-residual forward variants.
  "layer_norm/ln_parallel_fwd_1024.cu"
  "layer_norm/ln_parallel_fwd_1280.cu"
  "layer_norm/ln_parallel_fwd_1536.cu"
  "layer_norm/ln_parallel_fwd_2048.cu"
  "layer_norm/ln_parallel_fwd_256.cu"
  "layer_norm/ln_parallel_fwd_2560.cu"
  "layer_norm/ln_parallel_fwd_3072.cu"
  "layer_norm/ln_parallel_fwd_4096.cu"
  "layer_norm/ln_parallel_fwd_512.cu"
  "layer_norm/ln_parallel_fwd_5120.cu"
  "layer_norm/ln_parallel_fwd_6144.cu"
  "layer_norm/ln_parallel_fwd_7168.cu"
  "layer_norm/ln_parallel_fwd_768.cu"
  "layer_norm/ln_parallel_fwd_8192.cu"
  "layer_norm/ln_parallel_residual_bwd_kernels.cuh"
  "layer_norm/ln_parallel_residual_fwd_kernels.cuh"
  # Shared utilities.
  "layer_norm/ln_utils.cuh"
  "layer_norm/static_switch.h"
)
# Per-file include dirs: kernel sources include headers relative to the repo
# root. NOTE(review): this uses CMAKE_SOURCE_DIR (top of the whole build);
# if this project is ever consumed as a subproject, PROJECT_SOURCE_DIR is
# probably what is meant — confirm before changing.
# TODO: check if CLion support this:
# https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories
set_source_files_properties(
  ${layer_norm_SRC}
  PROPERTIES INCLUDE_DIRECTORIES "${CMAKE_SOURCE_DIR}/.")

if(GPU_LANG STREQUAL "CUDA")
  # Narrow the default kernel archs to what this build actually targets.
  cuda_archs_loose_intersection(layer_norm_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}" "${CUDA_ARCHS}")
  message(STATUS "Capabilities for kernel layer_norm: ${layer_norm_ARCHS}")
  set_gencode_flags_for_srcs(SRCS "${layer_norm_SRC}" CUDA_ARCHS "${layer_norm_ARCHS}")

  # Single pass over the sources (previously two separate foreach loops over
  # the same list): .cu files get the nvcc options; every file additionally
  # gets the CXX-only define. Per-file property order is unchanged.
  foreach(_KERNEL_SRC ${layer_norm_SRC})
    if(_KERNEL_SRC MATCHES ".*\\.cu$")
      set_property(
        SOURCE ${_KERNEL_SRC}
        APPEND PROPERTY
        COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:-O3;-U__CUDA_NO_HALF_OPERATORS__;-U__CUDA_NO_HALF_CONVERSIONS__;-U__CUDA_NO_BFLOAT16_OPERATORS__;-U__CUDA_NO_BFLOAT16_CONVERSIONS__;-U__CUDA_NO_BFLOAT162_OPERATORS__;-U__CUDA_NO_BFLOAT162_CONVERSIONS__;--expt-relaxed-constexpr;--expt-extended-lambda;--use_fast_math>"
      )
    endif()
    set_property(
      SOURCE ${_KERNEL_SRC}
      APPEND PROPERTY
      COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:-DFLASHATTENTION_DISABLE_PYBIND>"
    )
  endforeach()

  list(APPEND SRC "${layer_norm_SRC}")
endif()
# Build and install the Python extension module via the project helper
# (declared in cmake/utils.cmake). The target and destination names embed
# the git revision ("711aa42", dirty tree). USE_SABI 3 / WITH_SOABI request
# the CPython stable ABI and the platform SOABI suffix on the module name.
define_gpu_extension_target(
  _layer_norm_711aa42_dirty
  DESTINATION _layer_norm_711aa42_dirty
  LANGUAGE ${GPU_LANG}
  SOURCES ${SRC}
  COMPILE_FLAGS ${GPU_FLAGS}
  ARCHITECTURES ${GPU_ARCHES}
  #INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
  USE_SABI 3
  WITH_SOABI)
# Avoid a runtime libstdc++ dependency for the shipped module.
# NOTE(review): -static-libstdc++ is a GCC/Clang-on-Linux driver flag;
# confirm this line is never reached on MSVC builds before shipping there.
target_link_options(_layer_norm_711aa42_dirty PRIVATE -static-libstdc++)