# Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

import sys
import os
import subprocess
import platform

import warp.config
from warp.utils import ScopedTimer


# return a canonical machine architecture string
# - "x86_64" for x86-64, aka. AMD64, aka. x64
# - "aarch64" for AArch64, aka. ARM64
def machine_architecture() -> str:
    machine = platform.machine()
    if machine == "x86_64" or machine == "AMD64":
        return "x86_64"
    if machine == "aarch64" or machine == "arm64":
        return "aarch64"
    raise RuntimeError(f"Unrecognized machine architecture {machine}")


def run_cmd(cmd, capture=False):
    if warp.config.verbose:
        print(cmd)

    try:
        return subprocess.check_output(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        if e.stdout:
            print(e.stdout.decode())
        if e.stderr:
            print(e.stderr.decode())
        raise (e)


# cut-down version of vcvars64.bat that allows using
# custom toolchain locations
def set_msvc_compiler(msvc_path, sdk_path):
    if "INCLUDE" not in os.environ:
        os.environ["INCLUDE"] = ""
    if "LIB" not in os.environ:
        os.environ["LIB"] = ""

    msvc_path = os.path.abspath(msvc_path)
    sdk_path = os.path.abspath(sdk_path)

    os.environ["INCLUDE"] += os.pathsep + os.path.join(msvc_path, "include")
    os.environ["INCLUDE"] += os.pathsep + os.path.join(sdk_path, "include/winrt")
    os.environ["INCLUDE"] += os.pathsep + os.path.join(sdk_path, "include/um")
    os.environ["INCLUDE"] += os.pathsep + os.path.join(sdk_path, "include/ucrt")
    os.environ["INCLUDE"] += os.pathsep + os.path.join(sdk_path, "include/shared")

    os.environ["LIB"] += os.pathsep + os.path.join(msvc_path, "lib/x64")
    os.environ["LIB"] += os.pathsep + os.path.join(sdk_path, "lib/ucrt/x64")
    os.environ["LIB"] += os.pathsep + os.path.join(sdk_path, "lib/um/x64")

    os.environ["PATH"] += os.pathsep + os.path.join(msvc_path, "bin/HostX64/x64")
    os.environ["PATH"] += os.pathsep + os.path.join(sdk_path, "bin/x64")

    warp.config.host_compiler = os.path.join(msvc_path, "bin", "HostX64", "x64", "cl.exe")


def find_host_compiler():
    if os.name == "nt":
        try:
            # try and find an installed host compiler (msvc)
            # runs vcvars and copies back the build environment
            vswhere_path = r"%ProgramFiles(x86)%/Microsoft Visual Studio/Installer/vswhere.exe"
            vswhere_path = os.path.expandvars(vswhere_path)
            if not os.path.exists(vswhere_path):
                return ""

            vs_path = run_cmd(f'"{vswhere_path}" -latest -property installationPath').decode().rstrip()
            vsvars_path = os.path.join(vs_path, "VC\\Auxiliary\\Build\\vcvars64.bat")

            output = run_cmd(f'"{vsvars_path}" && set').decode()

            for line in output.splitlines():
                pair = line.split("=", 1)
                if len(pair) >= 2:
                    os.environ[pair[0]] = pair[1]

            cl_path = run_cmd("where cl.exe").decode("utf-8").rstrip()
            cl_version = os.environ["VCToolsVersion"].split(".")

            # ensure at least VS2019 version, see list of MSVC versions here:
            # https://en.wikipedia.org/wiki/Microsoft_Visual_C%2B%2B
            cl_required_major = 14
            cl_required_minor = 29

            if (int(cl_version[0]) < cl_required_major) or (
                int(cl_version[0]) == cl_required_major and int(cl_version[1]) < cl_required_minor
            ):
                print(
                    f"Warp: MSVC found but compiler version too old, found {cl_version[0]}.{cl_version[1]}, but must be {cl_required_major}.{cl_required_minor} or higher, kernel host compilation will be disabled."
                )
                return ""

            return cl_path

        except Exception as e:
            # couldn't find host compiler
            return ""
    else:
        # try and find g++
        try:
            return run_cmd("which g++").decode()
        except:
            return ""


def get_cuda_toolkit_version(cuda_home):
    try:
        # the toolkit version can be obtained by running "nvcc --version"
        nvcc_path = os.path.join(cuda_home, "bin", "nvcc")
        nvcc_version_output = subprocess.check_output([nvcc_path, "--version"]).decode("utf-8")
        # search for release substring (e.g., "release 11.5")
        import re

        m = re.search(r"(?<=release )\d+\.\d+", nvcc_version_output)
        if m is not None:
            return tuple(int(x) for x in m.group(0).split("."))
        else:
            raise Exception("Failed to parse NVCC output")

    except Exception as e:
        print(f"Failed to determine CUDA Toolkit version: {e}")


def quote(path):
    return '"' + path + '"'


def build_dll_for_arch(dll_path, cpp_paths, cu_path, libs, mode, arch, verify_fp=False, fast_math=False, quick=False):
    cuda_home = warp.config.cuda_path
    cuda_cmd = None

    if quick:
        cutlass_includes = ""
        cutlass_enabled = "WP_ENABLE_CUTLASS=0"
    else:
        cutlass_home = "warp/native/cutlass"
        cutlass_includes = f'-I"{cutlass_home}/include" -I"{cutlass_home}/tools/util/include"'
        cutlass_enabled = "WP_ENABLE_CUTLASS=1"

    if quick or cu_path is None:
        cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=0"
    else:
        cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=1"

    import pathlib

    warp_home_path = pathlib.Path(__file__).parent
    warp_home = warp_home_path.resolve()
    nanovdb_home = warp_home_path.parent / "_build/host-deps/nanovdb/include"

    # output stale, rebuild
    if warp.config.verbose:
        print(f"Building {dll_path}")

    native_dir = os.path.join(warp_home, "native")

    if cu_path:
        # check CUDA Toolkit version
        min_ctk_version = (11, 5)
        ctk_version = get_cuda_toolkit_version(cuda_home) or min_ctk_version
        if ctk_version < min_ctk_version:
            raise Exception(
                f"CUDA Toolkit version {min_ctk_version[0]}.{min_ctk_version[1]}+ is required (found {ctk_version[0]}.{ctk_version[1]} in {cuda_home})"
            )

        gencode_opts = []

        if quick:
            # minimum supported architectures (PTX)
            gencode_opts += ["-gencode=arch=compute_52,code=compute_52", "-gencode=arch=compute_75,code=compute_75"]
        else:
            # generate code for all supported architectures
            gencode_opts += [
                # SASS for supported desktop/datacenter architectures
                "-gencode=arch=compute_52,code=sm_52",  # Maxwell
                "-gencode=arch=compute_60,code=sm_60",  # Pascal
                "-gencode=arch=compute_61,code=sm_61",
                "-gencode=arch=compute_70,code=sm_70",  # Volta
                "-gencode=arch=compute_75,code=sm_75",  # Turing
                "-gencode=arch=compute_80,code=sm_80",  # Ampere
                "-gencode=arch=compute_86,code=sm_86",
            ]
            if arch == "aarch64" and sys.platform == "linux":
                gencode_opts += [
                    # SASS for supported mobile architectures (e.g. Tegra/Jetson)
                    "-gencode=arch=compute_53,code=sm_53",  # X1
                    "-gencode=arch=compute_62,code=sm_62",  # X2
                    "-gencode=arch=compute_72,code=sm_72",  # Xavier
                    "-gencode=arch=compute_87,code=sm_87",  # Orin
                ]

            # support for Ada and Hopper is available with CUDA Toolkit 11.8+
            if ctk_version >= (11, 8):
                gencode_opts += [
                    "-gencode=arch=compute_89,code=sm_89",  # Ada
                    "-gencode=arch=compute_90,code=sm_90",  # Hopper
                    # PTX for future hardware
                    "-gencode=arch=compute_90,code=compute_90",
                ]
            else:
                gencode_opts += [
                    # PTX for future hardware
                    "-gencode=arch=compute_86,code=compute_86",
                ]

        nvcc_opts = gencode_opts + [
            "-t0",  # multithreaded compilation
            "--extended-lambda",
        ]

        if fast_math:
            nvcc_opts.append("--use_fast_math")

    # is the library being built with CUDA enabled?
    cuda_enabled = "WP_ENABLE_CUDA=1" if (cu_path is not None) else "WP_ENABLE_CUDA=0"

    if os.name == "nt":
        if warp.config.host_compiler:
            host_linker = os.path.join(os.path.dirname(warp.config.host_compiler), "link.exe")
        else:
            raise RuntimeError("Warp build error: No host compiler was found")

        cpp_includes = f' /I"{warp_home_path.parent}/external/llvm-project/out/install/{mode}-{arch}/include"'
        cpp_includes += f' /I"{warp_home_path.parent}/_build/host-deps/llvm-project/release-{arch}/include"'
        cuda_includes = f' /I"{cuda_home}/include"' if cu_path else ""
        includes = cpp_includes + cuda_includes

        # nvrtc_static.lib is built with /MT and _ITERATOR_DEBUG_LEVEL=0 so if we link it in we must match these options
        if cu_path or mode != "debug":
            runtime = "/MT"
            iter_dbg = "_ITERATOR_DEBUG_LEVEL=0"
            debug = "NDEBUG"
        else:
            runtime = "/MTd"
            iter_dbg = "_ITERATOR_DEBUG_LEVEL=2"
            debug = "_DEBUG"

        if warp.config.mode == "debug":
            cpp_flags = f'/nologo {runtime} /Zi /Od /D "{debug}" /D WP_ENABLE_DEBUG=1 /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" /I"{nanovdb_home}" {includes}'
            linkopts = ["/DLL", "/DEBUG"]
        elif warp.config.mode == "release":
            cpp_flags = f'/nologo {runtime} /Ox /D "{debug}" /D WP_ENABLE_DEBUG=0 /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" /I"{nanovdb_home}" {includes}'
            linkopts = ["/DLL"]
        else:
            raise RuntimeError(f"Unrecognized build configuration (debug, release), got: {mode}")

        if verify_fp:
            cpp_flags += ' /D "WP_VERIFY_FP"'

        if fast_math:
            cpp_flags += " /fp:fast"

        with ScopedTimer("build", active=warp.config.verbose):
            for cpp_path in cpp_paths:
                cpp_out = cpp_path + ".obj"
                linkopts.append(quote(cpp_out))

                cpp_cmd = f'"{warp.config.host_compiler}" {cpp_flags} -c "{cpp_path}" /Fo"{cpp_out}"'
                run_cmd(cpp_cmd)

        if cu_path:
            cu_out = cu_path + ".o"

            if mode == "debug":
                cuda_cmd = f'"{cuda_home}/bin/nvcc" --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -I"{nanovdb_home}" -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'

            elif mode == "release":
                cuda_cmd = f'"{cuda_home}/bin/nvcc" -O3 {" ".join(nvcc_opts)} -I"{native_dir}" -I"{nanovdb_home}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'

            with ScopedTimer("build_cuda", active=warp.config.verbose):
                run_cmd(cuda_cmd)

                linkopts.append(quote(cu_out))
                linkopts.append(
                    f'cudart_static.lib nvrtc_static.lib nvrtc-builtins_static.lib nvptxcompiler_static.lib ws2_32.lib user32.lib /LIBPATH:"{cuda_home}/lib/x64"'
                )

        with ScopedTimer("link", active=warp.config.verbose):
            link_cmd = f'"{host_linker}" {" ".join(linkopts + libs)} /out:"{dll_path}"'
            run_cmd(link_cmd)

    else:
        cpp_includes = f' -I"{warp_home_path.parent}/external/llvm-project/out/install/{mode}-{arch}/include"'
        cpp_includes += f' -I"{warp_home_path.parent}/_build/host-deps/llvm-project/release-{arch}/include"'
        cuda_includes = f' -I"{cuda_home}/include"' if cu_path else ""
        includes = cpp_includes + cuda_includes

        if sys.platform == "darwin":
            target = f"--target={arch}-apple-macos11"
        else:
            target = ""

        if mode == "debug":
            cpp_flags = f'{target} -O0 -g -fno-rtti -D_DEBUG -DWP_ENABLE_DEBUG=1 -D{cuda_enabled} -D{cutlass_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden --std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -fkeep-inline-functions -I"{native_dir}" {includes}'

        if mode == "release":
            cpp_flags = f'{target} -O3 -DNDEBUG -DWP_ENABLE_DEBUG=0 -D{cuda_enabled} -D{cutlass_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden --std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -I"{native_dir}" {includes}'

        if verify_fp:
            cpp_flags += " -DWP_VERIFY_FP"

        if fast_math:
            cpp_flags += " -ffast-math"

        ld_inputs = []

        with ScopedTimer("build", active=warp.config.verbose):
            for cpp_path in cpp_paths:
                cpp_out = cpp_path + ".o"
                ld_inputs.append(quote(cpp_out))

                build_cmd = f'g++ {cpp_flags} -c "{cpp_path}" -o "{cpp_out}"'
                run_cmd(build_cmd)

        if cu_path:
            cu_out = cu_path + ".o"

            if mode == "debug":
                cuda_cmd = f'"{cuda_home}/bin/nvcc" -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'

            elif mode == "release":
                cuda_cmd = f'"{cuda_home}/bin/nvcc" -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'

            with ScopedTimer("build_cuda", active=warp.config.verbose):
                run_cmd(cuda_cmd)

                ld_inputs.append(quote(cu_out))
                ld_inputs.append(
                    f'-L"{cuda_home}/lib64" -lcudart_static -lnvrtc_static -lnvrtc-builtins_static -lnvptxcompiler_static -lpthread -ldl -lrt'
                )

        if sys.platform == "darwin":
            opt_no_undefined = "-Wl,-undefined,error"
            opt_exclude_libs = ""
        else:
            opt_no_undefined = "-Wl,--no-undefined"
            opt_exclude_libs = "-Wl,--exclude-libs,ALL"

        with ScopedTimer("link", active=warp.config.verbose):
            origin = "@loader_path" if (sys.platform == "darwin") else "$ORIGIN"
            link_cmd = f"g++ {target} -shared -Wl,-rpath,'{origin}' {opt_no_undefined} {opt_exclude_libs} -o '{dll_path}' {' '.join(ld_inputs + libs)}"
            run_cmd(link_cmd)

            # Strip symbols to reduce the binary size
            if sys.platform == "darwin":
                run_cmd(f"strip -x {dll_path}")  # Strip all local symbols
            else:  # Linux
                # Strip all symbols except for those needed to support debugging JIT-compiled code
                run_cmd(
                    f"strip --strip-all --keep-symbol=__jit_debug_register_code --keep-symbol=__jit_debug_descriptor {dll_path}"
                )


def build_dll(dll_path, cpp_paths, cu_path, libs=[], mode="release", verify_fp=False, fast_math=False, quick=False):
    if sys.platform == "darwin":
        # create a universal binary by combining x86-64 and AArch64 builds
        build_dll_for_arch(dll_path + "-x86_64", cpp_paths, cu_path, libs, mode, "x86_64", verify_fp, fast_math, quick)
        build_dll_for_arch(
            dll_path + "-aarch64", cpp_paths, cu_path, libs, mode, "aarch64", verify_fp, fast_math, quick
        )

        run_cmd(f"lipo -create -output {dll_path} {dll_path}-x86_64 {dll_path}-aarch64")
        os.remove(f"{dll_path}-x86_64")
        os.remove(f"{dll_path}-aarch64")

    else:
        build_dll_for_arch(
            dll_path, cpp_paths, cu_path, libs, mode, machine_architecture(), verify_fp, fast_math, quick
        )
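

# Minimal usage sketch (hypothetical; not part of Warp's own build scripts): shows how
# build_dll() might be driven directly to produce a CPU-only library. The output name and
# source path below are placeholder assumptions, and warp.config.cuda_path is only needed
# when a CUDA source file is passed via cu_path.
if __name__ == "__main__":
    warp.config.host_compiler = find_host_compiler()

    build_dll(
        dll_path="example_warp_lib.so",  # hypothetical output path (.dll on Windows, .dylib on macOS)
        cpp_paths=["warp/native/warp.cpp"],  # hypothetical single C++ translation unit
        cu_path=None,  # no CUDA source, so nvcc and the CUDA Toolkit are not required
        mode="release",
        quick=True,  # skip CUTLASS and the full set of GPU architectures
    )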