# layer_norm/CMakeLists.txt
# Provenance: Hugging Face Hub, user medmekk (HF Staff),
# revision f622ea1 ("add 9.0 build"), 6.94 kB.
cmake_minimum_required(VERSION 3.26)
project(layer_norm LANGUAGES CXX)
# Backend selector: "cuda" and "rocm" are handled below; any other value makes
# this file a no-op (see the early return() after the Torch lookup).
set(TARGET_DEVICE "cuda" CACHE STRING "Target device backend for kernel")
# Set CMAKE_INSTALL_LOCAL_ONLY at install time so `cmake --install` only runs
# this directory's install rules, for every component.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
include(FetchContent)
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
# Architectures this kernel can be built for; the build's requested arch lists
# are intersected against these further down.
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
# Project helpers used below: append_cmake_prefix_path, clear_cuda_arches,
# cuda_archs_loose_intersection, define_gpu_extension_target, etc.
include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
if(DEFINED Python_EXECUTABLE)
  # Allow passing through the interpreter (e.g. from setup.py); the hint makes
  # FindPython resolve the Development/SABI components for that interpreter.
  find_package(Python COMPONENTS Development Development.SABIModule Interpreter)
  if(NOT Python_FOUND)
    # Fix: report the interpreter that was actually requested — the original
    # message expanded the undefined variable ${EXECUTABLE}.
    message(FATAL_ERROR "Unable to find python matching: ${Python_EXECUTABLE}.")
  endif()
else()
  find_package(Python REQUIRED COMPONENTS Development Development.SABIModule Interpreter)
endif()
# Put torch's cmake config directory on CMAKE_PREFIX_PATH (helper from
# cmake/utils.cmake) so find_package(Torch) can locate TorchConfig.cmake.
append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
find_package(Torch REQUIRED)
# Nothing to build for non-GPU backends: stop processing this file early.
# The expansions are quoted so an empty or unusual TARGET_DEVICE value cannot
# be re-dereferenced by if() (CMP0054 semantics).
if(NOT "${TARGET_DEVICE}" STREQUAL "cuda" AND
   NOT "${TARGET_DEVICE}" STREQUAL "rocm")
  return()
endif()
# Default kernel architectures: Volta (7.0) through Hopper (9.0) everywhere;
# the 10.x/12.0 entries are only emitted when nvcc is new enough (>= 12.8) to
# understand them.
set(CUDA_DEFAULT_KERNEL_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0+PTX")
if(DEFINED CMAKE_CUDA_COMPILER_VERSION)
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
    set(CUDA_DEFAULT_KERNEL_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0+PTX")
  endif()
endif()
# Pick the GPU toolchain. HIP wins whenever it is present (equivalent to the
# original "NOT HIP_FOUND AND CUDA_FOUND" ordering); otherwise fall back to
# CUDA, and abort the configure if neither stack was found.
if(HIP_FOUND)
  set(GPU_LANG "HIP")
  # Importing torch recognizes and sets up some HIP/ROCm configuration but
  # does not let cmake recognize .hip files. In order to get cmake to
  # understand the .hip extension automatically, HIP must be enabled
  # explicitly.
  enable_language(HIP)
elseif(CUDA_FOUND)
  set(GPU_LANG "CUDA")
else()
  message(FATAL_ERROR "Can't find CUDA or HIP installation.")
endif()
if(GPU_LANG STREQUAL "CUDA")
  # Recover the -gencode flags torch injected, reduce them to an ascending
  # list of unique compute capabilities (helpers from cmake/utils.cmake).
  clear_cuda_arches(CUDA_ARCH_FLAGS)
  extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
  # Filter the target architectures by the supported archs
  # since for some files we will build for all CUDA_ARCHS.
  cuda_archs_loose_intersection(CUDA_ARCHS "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
  # Parallelize nvcc device compilation. (The original also re-checked
  # GPU_LANG here, which is redundant inside this branch.)
  if(NVCC_THREADS)
    list(APPEND GPU_FLAGS "--threads=${NVCC_THREADS}")
  endif()
  add_compile_definitions(CUDA_KERNEL)
elseif(GPU_LANG STREQUAL "HIP")
  set(ROCM_ARCHS "${HIP_SUPPORTED_ARCHS}")
  # TODO: remove this once we can set specific archs per source file set.
  override_gpu_arches(GPU_ARCHES
    ${GPU_LANG}
    "${${GPU_LANG}_SUPPORTED_ARCHS}")
  add_compile_definitions(ROCM_KERNEL)
endif()
# Note: no trailing else() — the detection above either sets GPU_LANG to
# "CUDA"/"HIP" or aborts with FATAL_ERROR, so the original fallback branch
# was unreachable and has been removed.
# Pull in the GPU compiler flags torch itself was built with (helper from
# cmake/utils.cmake) and fold them into the flags accumulated above.
get_torch_gpu_compiler_flags(TORCH_GPU_FLAGS ${GPU_LANG})
list(APPEND GPU_FLAGS ${TORCH_GPU_FLAGS})
# Torch binding layer (op registration) for this extension.
set(TORCH_layer_norm_SRC
torch-ext/torch_binding.cpp torch-ext/torch_binding.h
)
# SRC accumulates everything passed to define_gpu_extension_target below.
list(APPEND SRC "${TORCH_layer_norm_SRC}")
# Kernel sources. Each hidden-size variant (256..8192) is a separate .cu
# translation unit; forward/backward and "parallel" (parallel-residual)
# variants are instantiated from the shared *_kernels.cuh templates.
set(layer_norm_SRC
"layer_norm/ln.h"
"layer_norm/ln_api.cpp"
# Backward kernels, one per hidden size.
"layer_norm/ln_bwd_1024.cu"
"layer_norm/ln_bwd_1280.cu"
"layer_norm/ln_bwd_1536.cu"
"layer_norm/ln_bwd_2048.cu"
"layer_norm/ln_bwd_256.cu"
"layer_norm/ln_bwd_2560.cu"
"layer_norm/ln_bwd_3072.cu"
"layer_norm/ln_bwd_4096.cu"
"layer_norm/ln_bwd_512.cu"
"layer_norm/ln_bwd_5120.cu"
"layer_norm/ln_bwd_6144.cu"
"layer_norm/ln_bwd_7168.cu"
"layer_norm/ln_bwd_768.cu"
"layer_norm/ln_bwd_8192.cu"
"layer_norm/ln_bwd_kernels.cuh"
# Forward kernels, one per hidden size.
"layer_norm/ln_fwd_1024.cu"
"layer_norm/ln_fwd_1280.cu"
"layer_norm/ln_fwd_1536.cu"
"layer_norm/ln_fwd_2048.cu"
"layer_norm/ln_fwd_256.cu"
"layer_norm/ln_fwd_2560.cu"
"layer_norm/ln_fwd_3072.cu"
"layer_norm/ln_fwd_4096.cu"
"layer_norm/ln_fwd_512.cu"
"layer_norm/ln_fwd_5120.cu"
"layer_norm/ln_fwd_6144.cu"
"layer_norm/ln_fwd_7168.cu"
"layer_norm/ln_fwd_768.cu"
"layer_norm/ln_fwd_8192.cu"
"layer_norm/ln_fwd_kernels.cuh"
"layer_norm/ln_kernel_traits.h"
# Parallel-residual backward kernels.
"layer_norm/ln_parallel_bwd_1024.cu"
"layer_norm/ln_parallel_bwd_1280.cu"
"layer_norm/ln_parallel_bwd_1536.cu"
"layer_norm/ln_parallel_bwd_2048.cu"
"layer_norm/ln_parallel_bwd_256.cu"
"layer_norm/ln_parallel_bwd_2560.cu"
"layer_norm/ln_parallel_bwd_3072.cu"
"layer_norm/ln_parallel_bwd_4096.cu"
"layer_norm/ln_parallel_bwd_512.cu"
"layer_norm/ln_parallel_bwd_5120.cu"
"layer_norm/ln_parallel_bwd_6144.cu"
"layer_norm/ln_parallel_bwd_7168.cu"
"layer_norm/ln_parallel_bwd_768.cu"
"layer_norm/ln_parallel_bwd_8192.cu"
# Parallel-residual forward kernels.
"layer_norm/ln_parallel_fwd_1024.cu"
"layer_norm/ln_parallel_fwd_1280.cu"
"layer_norm/ln_parallel_fwd_1536.cu"
"layer_norm/ln_parallel_fwd_2048.cu"
"layer_norm/ln_parallel_fwd_256.cu"
"layer_norm/ln_parallel_fwd_2560.cu"
"layer_norm/ln_parallel_fwd_3072.cu"
"layer_norm/ln_parallel_fwd_4096.cu"
"layer_norm/ln_parallel_fwd_512.cu"
"layer_norm/ln_parallel_fwd_5120.cu"
"layer_norm/ln_parallel_fwd_6144.cu"
"layer_norm/ln_parallel_fwd_7168.cu"
"layer_norm/ln_parallel_fwd_768.cu"
"layer_norm/ln_parallel_fwd_8192.cu"
# Shared headers/templates.
"layer_norm/ln_parallel_residual_bwd_kernels.cuh"
"layer_norm/ln_parallel_residual_fwd_kernels.cuh"
"layer_norm/ln_utils.cuh"
"layer_norm/static_switch.h"
)
# Give the kernel sources the project root as an include directory so the
# "layer_norm/..." includes resolve. PROJECT_SOURCE_DIR is used instead of
# CMAKE_SOURCE_DIR so the path stays correct when this project is built as a
# subproject of a larger tree (the redundant "/." suffix is also dropped).
# TODO: check if CLion support this:
# https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories
set_source_files_properties(
  ${layer_norm_SRC}
  PROPERTIES INCLUDE_DIRECTORIES "${PROJECT_SOURCE_DIR}")
if(GPU_LANG STREQUAL "CUDA")
  # Clamp this kernel's default arch list to what the build actually targets,
  # then attach the matching -gencode flags per source file.
  cuda_archs_loose_intersection(layer_norm_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}" "${CUDA_ARCHS}")
  message(STATUS "Capabilities for kernel layer_norm: ${layer_norm_ARCHS}")
  set_gencode_flags_for_srcs(SRCS "${layer_norm_SRC}" CUDA_ARCHS "${layer_norm_ARCHS}")
  # Per-source compile options, applied in a single pass over the list:
  # nvcc-only flags on the .cu translation units, plus the pybind-disabling
  # define on every host (CXX) compile.
  foreach(_kernel_src IN LISTS layer_norm_SRC)
    if(_kernel_src MATCHES "\\.cu$")
      set_property(
        SOURCE ${_kernel_src}
        APPEND PROPERTY
        COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:-O3;-U__CUDA_NO_HALF_OPERATORS__;-U__CUDA_NO_HALF_CONVERSIONS__;-U__CUDA_NO_BFLOAT16_OPERATORS__;-U__CUDA_NO_BFLOAT16_CONVERSIONS__;-U__CUDA_NO_BFLOAT162_OPERATORS__;-U__CUDA_NO_BFLOAT162_CONVERSIONS__;--expt-relaxed-constexpr;--expt-extended-lambda;--use_fast_math>"
      )
    endif()
    set_property(
      SOURCE ${_kernel_src}
      APPEND PROPERTY
      COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:-DFLASHATTENTION_DISABLE_PYBIND>"
    )
  endforeach()
  list(APPEND SRC "${layer_norm_SRC}")
endif()
# Build the Python extension module (helper from cmake/utils.cmake); the
# target and install destination embed the source revision/dirty suffix so
# differently-built modules can coexist. USE_SABI 3 / WITH_SOABI request the
# stable-ABI build with the proper extension suffix.
define_gpu_extension_target(
_layer_norm_711aa42_dirty
DESTINATION _layer_norm_711aa42_dirty
LANGUAGE ${GPU_LANG}
SOURCES ${SRC}
COMPILE_FLAGS ${GPU_FLAGS}
ARCHITECTURES ${GPU_ARCHES}
#INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
USE_SABI 3
WITH_SOABI)
# Statically link libstdc++ so the shipped module does not depend on the
# host's C++ runtime. NOTE(review): GCC/Clang-specific flag — assumes a
# GNU-compatible toolchain; confirm before supporting MSVC.
target_link_options(_layer_norm_711aa42_dirty PRIVATE -static-libstdc++)