Spaces:
Sleeping
Sleeping
| # Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved. | |
| # NVIDIA CORPORATION and its licensors retain all intellectual property | |
| # and proprietary rights in and to this software, related documentation | |
| # and any modifications thereto. Any use, reproduction, disclosure or | |
| # distribution of this software and related documentation without an express | |
| # license agreement from NVIDIA CORPORATION is strictly prohibited. | |
| import sys | |
| import os | |
| import subprocess | |
| import platform | |
| import warp.config | |
| from warp.utils import ScopedTimer | |
# return a canonical machine architecture string
# - "x86_64" for x86-64, aka. AMD64, aka. x64
# - "aarch64" for AArch64, aka. ARM64
def machine_architecture() -> str:
    """Map platform.machine() onto a canonical architecture name."""
    arch = platform.machine()
    if arch in ("x86_64", "AMD64"):
        return "x86_64"
    if arch in ("aarch64", "arm64"):
        return "aarch64"
    raise RuntimeError(f"Unrecognized machine architecture {arch}")
def run_cmd(cmd, capture=False):
    """Run a shell command and return its captured stdout as bytes.

    Args:
        cmd: Command line string, executed through the shell.
        capture: Currently unused; kept for interface compatibility.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero; any
            captured output is echoed first to aid debugging.
    """
    if warp.config.verbose:
        print(cmd)

    try:
        return subprocess.check_output(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        # echo whatever output was captured before propagating the failure
        if e.stdout:
            print(e.stdout.decode())
        # NOTE(review): check_output does not redirect stderr, so e.stderr is
        # expected to be None here; branch kept for safety.
        if e.stderr:
            print(e.stderr.decode())
        # bare raise preserves the original traceback (was `raise (e)`)
        raise
# cut-down version of vcvars64.bat that allows using
# custom toolchain locations
def set_msvc_compiler(msvc_path, sdk_path):
    """Point the build environment at an explicit MSVC toolchain and Windows SDK.

    Appends the toolchain's include, library, and binary directories to the
    INCLUDE, LIB, and PATH environment variables, and records the cl.exe
    location in warp.config.host_compiler.
    """
    # make sure the variables we append to exist
    os.environ.setdefault("INCLUDE", "")
    os.environ.setdefault("LIB", "")

    msvc_path = os.path.abspath(msvc_path)
    sdk_path = os.path.abspath(sdk_path)

    include_dirs = (
        os.path.join(msvc_path, "include"),
        os.path.join(sdk_path, "include/winrt"),
        os.path.join(sdk_path, "include/um"),
        os.path.join(sdk_path, "include/ucrt"),
        os.path.join(sdk_path, "include/shared"),
    )
    lib_dirs = (
        os.path.join(msvc_path, "lib/x64"),
        os.path.join(sdk_path, "lib/ucrt/x64"),
        os.path.join(sdk_path, "lib/um/x64"),
    )
    path_dirs = (
        os.path.join(msvc_path, "bin/HostX64/x64"),
        os.path.join(sdk_path, "bin/x64"),
    )

    # each entry is prefixed with a path separator, matching vcvars64 output
    os.environ["INCLUDE"] += os.pathsep + os.pathsep.join(include_dirs)
    os.environ["LIB"] += os.pathsep + os.pathsep.join(lib_dirs)
    os.environ["PATH"] += os.pathsep + os.pathsep.join(path_dirs)

    warp.config.host_compiler = os.path.join(msvc_path, "bin", "HostX64", "x64", "cl.exe")
def find_host_compiler():
    """Locate a host C++ compiler, returning its path or "" when none is found.

    On Windows, uses vswhere to find the latest Visual Studio installation,
    runs vcvars64.bat and copies the resulting build environment back into
    os.environ, then requires at least the VS2019 toolset (MSVC 14.29).
    On other platforms, looks for g++ on the PATH.
    """
    if os.name == "nt":
        try:
            # try and find an installed host compiler (msvc)
            # runs vcvars and copies back the build environment
            vswhere_path = r"%ProgramFiles(x86)%/Microsoft Visual Studio/Installer/vswhere.exe"
            vswhere_path = os.path.expandvars(vswhere_path)
            if not os.path.exists(vswhere_path):
                return ""

            vs_path = run_cmd(f'"{vswhere_path}" -latest -property installationPath').decode().rstrip()
            vsvars_path = os.path.join(vs_path, "VC\\Auxiliary\\Build\\vcvars64.bat")

            output = run_cmd(f'"{vsvars_path}" && set').decode()

            for line in output.splitlines():
                pair = line.split("=", 1)
                if len(pair) >= 2:
                    os.environ[pair[0]] = pair[1]

            cl_path = run_cmd("where cl.exe").decode("utf-8").rstrip()
            cl_version = os.environ["VCToolsVersion"].split(".")

            # ensure at least VS2019 version, see list of MSVC versions here https://en.wikipedia.org/wiki/Microsoft_Visual_C%2B%2B
            cl_required_major = 14
            cl_required_minor = 29

            # tuple comparison avoids the fragile and/or precedence chain
            if (int(cl_version[0]), int(cl_version[1])) < (cl_required_major, cl_required_minor):
                print(
                    f"Warp: MSVC found but compiler version too old, found {cl_version[0]}.{cl_version[1]}, but must be {cl_required_major}.{cl_required_minor} or higher, kernel host compilation will be disabled."
                )
                return ""

            return cl_path
        except Exception:
            # couldn't find host compiler
            return ""
    else:
        # try and find g++
        try:
            # strip the trailing newline `which` emits so the path is directly usable
            return run_cmd("which g++").decode().rstrip()
        except Exception:
            return ""
def get_cuda_toolkit_version(cuda_home):
    """Return the CUDA Toolkit version as a (major, minor) tuple, or None on failure."""
    try:
        # the toolkit version can be obtained by running "nvcc --version"
        nvcc_path = os.path.join(cuda_home, "bin", "nvcc")
        nvcc_version_output = subprocess.check_output([nvcc_path, "--version"]).decode("utf-8")

        # search for release substring (e.g., "release 11.5")
        import re

        m = re.search(r"(?<=release )\d+\.\d+", nvcc_version_output)
        if m is None:
            raise Exception("Failed to parse NVCC output")

        return tuple(int(x) for x in m.group(0).split("."))
    except Exception as e:
        print(f"Failed to determine CUDA Toolkit version: {e}")
        return None
def quote(path):
    """Wrap *path* in double quotes for use on a shell command line."""
    return f'"{path}"'
def build_dll_for_arch(dll_path, cpp_paths, cu_path, libs, mode, arch, verify_fp=False, fast_math=False, quick=False):
    """Compile and link the Warp native shared library for one architecture.

    Args:
        dll_path: Output path of the shared library to produce.
        cpp_paths: List of C++ source files to compile.
        cu_path: CUDA source file to compile, or None/"" to build without CUDA.
        libs: Extra linker inputs appended to the link command line.
        mode: Build configuration, "debug" or "release".
        arch: Target architecture, "x86_64" or "aarch64".
        verify_fp: If True, defines WP_VERIFY_FP for the host compile.
        fast_math: If True, enables fast-math flags on host and device compilers.
        quick: If True, disables CUTLASS and builds a reduced set of GPU archs.

    Raises:
        Exception: If the installed CUDA Toolkit is older than the minimum.
        RuntimeError: On Windows when no host compiler is configured, or when
            the build configuration is unrecognized.
    """
    cuda_home = warp.config.cuda_path
    cuda_cmd = None

    # CUTLASS support is skipped entirely for quick builds
    if quick:
        cutlass_includes = ""
        cutlass_enabled = "WP_ENABLE_CUTLASS=0"
    else:
        cutlass_home = "warp/native/cutlass"
        cutlass_includes = f'-I"{cutlass_home}/include" -I"{cutlass_home}/tools/util/include"'
        cutlass_enabled = "WP_ENABLE_CUTLASS=1"

    # the CUDA compatibility layer only applies to full CUDA builds
    if quick or cu_path is None:
        cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=0"
    else:
        cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=1"

    import pathlib

    warp_home_path = pathlib.Path(__file__).parent
    warp_home = warp_home_path.resolve()
    nanovdb_home = warp_home_path.parent / "_build/host-deps/nanovdb/include"

    # output stale, rebuild
    if warp.config.verbose:
        print(f"Building {dll_path}")

    native_dir = os.path.join(warp_home, "native")

    if cu_path:
        # check CUDA Toolkit version
        min_ctk_version = (11, 5)
        # a failed version probe returns None; fall back to the minimum so the build proceeds
        ctk_version = get_cuda_toolkit_version(cuda_home) or min_ctk_version
        if ctk_version < min_ctk_version:
            raise Exception(
                f"CUDA Toolkit version {min_ctk_version[0]}.{min_ctk_version[1]}+ is required (found {ctk_version[0]}.{ctk_version[1]} in {cuda_home})"
            )

        gencode_opts = []

        if quick:
            # minimum supported architectures (PTX)
            gencode_opts += ["-gencode=arch=compute_52,code=compute_52", "-gencode=arch=compute_75,code=compute_75"]
        else:
            # generate code for all supported architectures
            gencode_opts += [
                # SASS for supported desktop/datacenter architectures
                "-gencode=arch=compute_52,code=sm_52",  # Maxwell
                "-gencode=arch=compute_60,code=sm_60",  # Pascal
                "-gencode=arch=compute_61,code=sm_61",
                "-gencode=arch=compute_70,code=sm_70",  # Volta
                "-gencode=arch=compute_75,code=sm_75",  # Turing
                "-gencode=arch=compute_80,code=sm_80",  # Ampere
                "-gencode=arch=compute_86,code=sm_86",
            ]

            if arch == "aarch64" and sys.platform == "linux":
                gencode_opts += [
                    # SASS for supported mobile architectures (e.g. Tegra/Jetson)
                    "-gencode=arch=compute_53,code=sm_53",  # X1
                    "-gencode=arch=compute_62,code=sm_62",  # X2
                    "-gencode=arch=compute_72,code=sm_72",  # Xavier
                    "-gencode=arch=compute_87,code=sm_87",  # Orin
                ]

            # support for Ada and Hopper is available with CUDA Toolkit 11.8+
            if ctk_version >= (11, 8):
                gencode_opts += [
                    "-gencode=arch=compute_89,code=sm_89",  # Ada
                    "-gencode=arch=compute_90,code=sm_90",  # Hopper
                    # PTX for future hardware
                    "-gencode=arch=compute_90,code=compute_90",
                ]
            else:
                gencode_opts += [
                    # PTX for future hardware
                    "-gencode=arch=compute_86,code=compute_86",
                ]

        nvcc_opts = gencode_opts + [
            "-t0",  # multithreaded compilation
            "--extended-lambda",
        ]

        if fast_math:
            nvcc_opts.append("--use_fast_math")

    # is the library being built with CUDA enabled?
    cuda_enabled = "WP_ENABLE_CUDA=1" if (cu_path is not None) else "WP_ENABLE_CUDA=0"

    if os.name == "nt":
        if warp.config.host_compiler:
            # link.exe lives alongside cl.exe in the MSVC toolchain layout
            host_linker = os.path.join(os.path.dirname(warp.config.host_compiler), "link.exe")
        else:
            raise RuntimeError("Warp build error: No host compiler was found")

        cpp_includes = f' /I"{warp_home_path.parent}/external/llvm-project/out/install/{mode}-{arch}/include"'
        cpp_includes += f' /I"{warp_home_path.parent}/_build/host-deps/llvm-project/release-{arch}/include"'
        cuda_includes = f' /I"{cuda_home}/include"' if cu_path else ""
        includes = cpp_includes + cuda_includes

        # nvrtc_static.lib is built with /MT and _ITERATOR_DEBUG_LEVEL=0 so if we link it in we must match these options
        if cu_path or mode != "debug":
            runtime = "/MT"
            iter_dbg = "_ITERATOR_DEBUG_LEVEL=0"
            debug = "NDEBUG"
        else:
            runtime = "/MTd"
            iter_dbg = "_ITERATOR_DEBUG_LEVEL=2"
            debug = "_DEBUG"

        # NOTE(review): these branches test warp.config.mode while most of this
        # function uses the `mode` parameter — confirm callers keep them in sync
        if warp.config.mode == "debug":
            cpp_flags = f'/nologo {runtime} /Zi /Od /D "{debug}" /D WP_ENABLE_DEBUG=1 /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" /I"{nanovdb_home}" {includes}'
            linkopts = ["/DLL", "/DEBUG"]
        elif warp.config.mode == "release":
            cpp_flags = f'/nologo {runtime} /Ox /D "{debug}" /D WP_ENABLE_DEBUG=0 /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" /I"{nanovdb_home}" {includes}'
            linkopts = ["/DLL"]
        else:
            raise RuntimeError(f"Unrecognized build configuration (debug, release), got: {mode}")

        if verify_fp:
            cpp_flags += ' /D "WP_VERIFY_FP"'

        if fast_math:
            cpp_flags += " /fp:fast"

        # compile each C++ translation unit; queue its object file for the link
        with ScopedTimer("build", active=warp.config.verbose):
            for cpp_path in cpp_paths:
                cpp_out = cpp_path + ".obj"
                linkopts.append(quote(cpp_out))

                cpp_cmd = f'"{warp.config.host_compiler}" {cpp_flags} -c "{cpp_path}" /Fo"{cpp_out}"'
                run_cmd(cpp_cmd)

        if cu_path:
            cu_out = cu_path + ".o"

            if mode == "debug":
                cuda_cmd = f'"{cuda_home}/bin/nvcc" --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -I"{nanovdb_home}" -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
            elif mode == "release":
                cuda_cmd = f'"{cuda_home}/bin/nvcc" -O3 {" ".join(nvcc_opts)} -I"{native_dir}" -I"{nanovdb_home}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'

            with ScopedTimer("build_cuda", active=warp.config.verbose):
                run_cmd(cuda_cmd)

                linkopts.append(quote(cu_out))
                # CUDA static runtime + NVRTC libraries required by the Warp runtime
                linkopts.append(
                    f'cudart_static.lib nvrtc_static.lib nvrtc-builtins_static.lib nvptxcompiler_static.lib ws2_32.lib user32.lib /LIBPATH:"{cuda_home}/lib/x64"'
                )

        with ScopedTimer("link", active=warp.config.verbose):
            link_cmd = f'"{host_linker}" {" ".join(linkopts + libs)} /out:"{dll_path}"'
            run_cmd(link_cmd)

    else:
        cpp_includes = f' -I"{warp_home_path.parent}/external/llvm-project/out/install/{mode}-{arch}/include"'
        cpp_includes += f' -I"{warp_home_path.parent}/_build/host-deps/llvm-project/release-{arch}/include"'
        cuda_includes = f' -I"{cuda_home}/include"' if cu_path else ""
        includes = cpp_includes + cuda_includes

        if sys.platform == "darwin":
            # cross-compile for the requested architecture on macOS
            target = f"--target={arch}-apple-macos11"
        else:
            target = ""

        if mode == "debug":
            cpp_flags = f'{target} -O0 -g -fno-rtti -D_DEBUG -DWP_ENABLE_DEBUG=1 -D{cuda_enabled} -D{cutlass_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden --std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -fkeep-inline-functions -I"{native_dir}" {includes}'

        if mode == "release":
            cpp_flags = f'{target} -O3 -DNDEBUG -DWP_ENABLE_DEBUG=0 -D{cuda_enabled} -D{cutlass_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden --std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -I"{native_dir}" {includes}'

        if verify_fp:
            cpp_flags += " -DWP_VERIFY_FP"

        if fast_math:
            cpp_flags += " -ffast-math"

        ld_inputs = []

        # compile each C++ translation unit; queue its object file for the link
        with ScopedTimer("build", active=warp.config.verbose):
            for cpp_path in cpp_paths:
                cpp_out = cpp_path + ".o"
                ld_inputs.append(quote(cpp_out))

                build_cmd = f'g++ {cpp_flags} -c "{cpp_path}" -o "{cpp_out}"'
                run_cmd(build_cmd)

        if cu_path:
            cu_out = cu_path + ".o"

            if mode == "debug":
                cuda_cmd = f'"{cuda_home}/bin/nvcc" -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
            elif mode == "release":
                cuda_cmd = f'"{cuda_home}/bin/nvcc" -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'

            with ScopedTimer("build_cuda", active=warp.config.verbose):
                run_cmd(cuda_cmd)

                ld_inputs.append(quote(cu_out))
                # CUDA static runtime + NVRTC libraries required by the Warp runtime
                ld_inputs.append(
                    f'-L"{cuda_home}/lib64" -lcudart_static -lnvrtc_static -lnvrtc-builtins_static -lnvptxcompiler_static -lpthread -ldl -lrt'
                )

        if sys.platform == "darwin":
            opt_no_undefined = "-Wl,-undefined,error"
            opt_exclude_libs = ""
        else:
            opt_no_undefined = "-Wl,--no-undefined"
            opt_exclude_libs = "-Wl,--exclude-libs,ALL"

        with ScopedTimer("link", active=warp.config.verbose):
            # rpath makes dependent libraries resolvable next to the built library
            origin = "@loader_path" if (sys.platform == "darwin") else "$ORIGIN"
            link_cmd = f"g++ {target} -shared -Wl,-rpath,'{origin}' {opt_no_undefined} {opt_exclude_libs} -o '{dll_path}' {' '.join(ld_inputs + libs)}"
            run_cmd(link_cmd)

            # Strip symbols to reduce the binary size
            if sys.platform == "darwin":
                run_cmd(f"strip -x {dll_path}")  # Strip all local symbols
            else:  # Linux
                # Strip all symbols except for those needed to support debugging JIT-compiled code
                run_cmd(
                    f"strip --strip-all --keep-symbol=__jit_debug_register_code --keep-symbol=__jit_debug_descriptor {dll_path}"
                )
def build_dll(dll_path, cpp_paths, cu_path, libs=None, mode="release", verify_fp=False, fast_math=False, quick=False):
    """Build the Warp native shared library for the current platform.

    On macOS, builds x86-64 and AArch64 slices and merges them into a
    universal binary with lipo; elsewhere builds a single library for the
    current machine architecture.

    Args:
        dll_path: Output path of the shared library.
        cpp_paths: List of C++ source files to compile.
        cu_path: CUDA source file, or None to build without CUDA.
        libs: Extra linker inputs (defaults to an empty list).
        mode: Build configuration, "debug" or "release".
        verify_fp: Enable floating-point verification defines.
        fast_math: Enable fast-math compiler flags.
        quick: Build a reduced configuration (no CUTLASS, fewer GPU archs).
    """
    # use a None sentinel rather than a shared mutable default argument
    if libs is None:
        libs = []

    if sys.platform == "darwin":
        # create a universal binary by combining x86-64 and AArch64 builds
        build_dll_for_arch(dll_path + "-x86_64", cpp_paths, cu_path, libs, mode, "x86_64", verify_fp, fast_math, quick)
        build_dll_for_arch(
            dll_path + "-aarch64", cpp_paths, cu_path, libs, mode, "aarch64", verify_fp, fast_math, quick
        )

        run_cmd(f"lipo -create -output {dll_path} {dll_path}-x86_64 {dll_path}-aarch64")
        os.remove(f"{dll_path}-x86_64")
        os.remove(f"{dll_path}-aarch64")
    else:
        build_dll_for_arch(
            dll_path, cpp_paths, cu_path, libs, mode, machine_architecture(), verify_fp, fast_math, quick
        )