# NOTE: removed stray web-page artifact that preceded this file
# (upload caption: "qbhf2's picture / added NvidiaWarp and GarmentCode repos / 66c9c8a");
# it was not part of the source and would not parse as Python.
# Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import sys
import os
import subprocess
import platform
import warp.config
from warp.utils import ScopedTimer
# return a canonical machine architecture string
# - "x86_64" for x86-64, aka. AMD64, aka. x64
# - "aarch64" for AArch64, aka. ARM64
def machine_architecture() -> str:
    """Map platform.machine() onto a canonical architecture string."""
    reported = platform.machine()
    if reported in ("x86_64", "AMD64"):
        return "x86_64"
    if reported in ("aarch64", "arm64"):
        return "aarch64"
    raise RuntimeError(f"Unrecognized machine architecture {reported}")
def run_cmd(cmd, capture=False):
    """Run *cmd* through the shell and return its stdout as bytes.

    Args:
        cmd: Shell command line to execute.
        capture: Unused; kept for backward compatibility with existing callers.

    Raises:
        subprocess.CalledProcessError: if the command exits with non-zero
            status; any captured output is echoed first to aid debugging.
    """
    if warp.config.verbose:
        print(cmd)

    try:
        return subprocess.check_output(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        # surface whatever output the failed command produced
        if e.stdout:
            print(e.stdout.decode())
        if e.stderr:
            print(e.stderr.decode())
        # bare raise (not `raise (e)`) preserves the original traceback
        raise
# cut-down version of vcvars64.bat that allows using
# custom toolchain locations
def set_msvc_compiler(msvc_path, sdk_path):
    """Point the build environment at a custom MSVC toolchain + Windows SDK."""
    # make sure the variables we append to exist
    for var in ("INCLUDE", "LIB"):
        if var not in os.environ:
            os.environ[var] = ""

    msvc_path = os.path.abspath(msvc_path)
    sdk_path = os.path.abspath(sdk_path)

    include_dirs = [
        os.path.join(msvc_path, "include"),
        os.path.join(sdk_path, "include/winrt"),
        os.path.join(sdk_path, "include/um"),
        os.path.join(sdk_path, "include/ucrt"),
        os.path.join(sdk_path, "include/shared"),
    ]
    lib_dirs = [
        os.path.join(msvc_path, "lib/x64"),
        os.path.join(sdk_path, "lib/ucrt/x64"),
        os.path.join(sdk_path, "lib/um/x64"),
    ]
    bin_dirs = [
        os.path.join(msvc_path, "bin/HostX64/x64"),
        os.path.join(sdk_path, "bin/x64"),
    ]

    os.environ["INCLUDE"] += "".join(os.pathsep + d for d in include_dirs)
    os.environ["LIB"] += "".join(os.pathsep + d for d in lib_dirs)
    os.environ["PATH"] += "".join(os.pathsep + d for d in bin_dirs)

    warp.config.host_compiler = os.path.join(msvc_path, "bin", "HostX64", "x64", "cl.exe")
def find_host_compiler():
    """Locate a host C++ compiler, returning its path or "" if none is found.

    On Windows this uses vswhere to find the latest Visual Studio, runs
    vcvars64.bat and copies the resulting build environment back into
    os.environ, then requires at least the VS2019 toolset (14.29).
    Elsewhere it looks for g++ on the PATH.
    """
    if os.name == "nt":
        try:
            # try and find an installed host compiler (msvc)
            # runs vcvars and copies back the build environment
            vswhere_path = r"%ProgramFiles(x86)%/Microsoft Visual Studio/Installer/vswhere.exe"
            vswhere_path = os.path.expandvars(vswhere_path)
            if not os.path.exists(vswhere_path):
                return ""

            vs_path = run_cmd(f'"{vswhere_path}" -latest -property installationPath').decode().rstrip()
            vsvars_path = os.path.join(vs_path, "VC\\Auxiliary\\Build\\vcvars64.bat")

            output = run_cmd(f'"{vsvars_path}" && set').decode()

            # import the environment vcvars set up into this process
            for line in output.splitlines():
                pair = line.split("=", 1)
                if len(pair) >= 2:
                    os.environ[pair[0]] = pair[1]

            cl_path = run_cmd("where cl.exe").decode("utf-8").rstrip()
            cl_version = os.environ["VCToolsVersion"].split(".")

            # ensure at least VS2019 version, see list of MSVC versions here https://en.wikipedia.org/wiki/Microsoft_Visual_C%2B%2B
            cl_required_major = 14
            cl_required_minor = 29

            if (int(cl_version[0]) < cl_required_major) or (
                int(cl_version[0]) == cl_required_major and int(cl_version[1]) < cl_required_minor
            ):
                print(
                    f"Warp: MSVC found but compiler version too old, found {cl_version[0]}.{cl_version[1]}, but must be {cl_required_major}.{cl_required_minor} or higher, kernel host compilation will be disabled."
                )
                return ""

            return cl_path
        except Exception:
            # couldn't find host compiler
            return ""
    else:
        # try and find g++
        try:
            # rstrip the trailing newline so the result is a usable path,
            # consistent with the .rstrip() on the Windows branch
            return run_cmd("which g++").decode().rstrip()
        except Exception:
            return ""
def get_cuda_toolkit_version(cuda_home):
    """Return the CUDA Toolkit version found in *cuda_home* as a tuple.

    Runs "<cuda_home>/bin/nvcc --version" and parses the release number
    (e.g. (11, 5)). On any failure a message is printed and None is
    returned implicitly.
    """
    import re

    try:
        # the toolkit version can be obtained by running "nvcc --version"
        nvcc_path = os.path.join(cuda_home, "bin", "nvcc")
        version_text = subprocess.check_output([nvcc_path, "--version"]).decode("utf-8")

        # search for release substring (e.g., "release 11.5")
        match = re.search(r"(?<=release )\d+\.\d+", version_text)
        if match is None:
            raise Exception("Failed to parse NVCC output")

        major, minor = match.group(0).split(".")
        return (int(major), int(minor))
    except Exception as e:
        print(f"Failed to determine CUDA Toolkit version: {e}")
def quote(path):
    """Wrap *path* in double quotes for use in a shell command line."""
    return f'"{path}"'
def build_dll_for_arch(dll_path, cpp_paths, cu_path, libs, mode, arch, verify_fp=False, fast_math=False, quick=False):
    """Compile and link the Warp native shared library for one architecture.

    Args:
        dll_path: Output path of the shared library to produce.
        cpp_paths: C++ source files to compile with the host compiler.
        cu_path: CUDA source file to compile with nvcc, or None for a CPU-only build.
        libs: Extra library arguments passed verbatim to the linker.
        mode: Build configuration, "debug" or "release".
        arch: Target machine architecture, "x86_64" or "aarch64".
        verify_fp: If True, define WP_VERIFY_FP in the C++ build.
        fast_math: If True, enable fast-math optimizations (nvcc --use_fast_math / /fp:fast / -ffast-math).
        quick: Faster build — minimal PTX-only GPU targets and CUTLASS disabled.

    Raises:
        Exception: if the installed CUDA Toolkit is older than the required minimum.
        RuntimeError: on Windows when no host compiler is configured, or when
            warp.config.mode is neither "debug" nor "release".
    """
    cuda_home = warp.config.cuda_path
    cuda_cmd = None

    # quick builds disable CUTLASS entirely to cut compile time
    if quick:
        cutlass_includes = ""
        cutlass_enabled = "WP_ENABLE_CUTLASS=0"
    else:
        cutlass_home = "warp/native/cutlass"
        cutlass_includes = f'-I"{cutlass_home}/include" -I"{cutlass_home}/tools/util/include"'
        cutlass_enabled = "WP_ENABLE_CUTLASS=1"

    if quick or cu_path is None:
        cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=0"
    else:
        cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=1"

    import pathlib

    warp_home_path = pathlib.Path(__file__).parent
    warp_home = warp_home_path.resolve()
    nanovdb_home = warp_home_path.parent / "_build/host-deps/nanovdb/include"

    # output stale, rebuild
    if warp.config.verbose:
        print(f"Building {dll_path}")

    native_dir = os.path.join(warp_home, "native")

    if cu_path:
        # check CUDA Toolkit version
        min_ctk_version = (11, 5)
        # fall back to the minimum if the version cannot be determined
        ctk_version = get_cuda_toolkit_version(cuda_home) or min_ctk_version
        if ctk_version < min_ctk_version:
            raise Exception(
                f"CUDA Toolkit version {min_ctk_version[0]}.{min_ctk_version[1]}+ is required (found {ctk_version[0]}.{ctk_version[1]} in {cuda_home})"
            )

        gencode_opts = []

        if quick:
            # minimum supported architectures (PTX)
            gencode_opts += ["-gencode=arch=compute_52,code=compute_52", "-gencode=arch=compute_75,code=compute_75"]
        else:
            # generate code for all supported architectures
            gencode_opts += [
                # SASS for supported desktop/datacenter architectures
                "-gencode=arch=compute_52,code=sm_52",  # Maxwell
                "-gencode=arch=compute_60,code=sm_60",  # Pascal
                "-gencode=arch=compute_61,code=sm_61",
                "-gencode=arch=compute_70,code=sm_70",  # Volta
                "-gencode=arch=compute_75,code=sm_75",  # Turing
                "-gencode=arch=compute_80,code=sm_80",  # Ampere
                "-gencode=arch=compute_86,code=sm_86",
            ]

            if arch == "aarch64" and sys.platform == "linux":
                gencode_opts += [
                    # SASS for supported mobile architectures (e.g. Tegra/Jetson)
                    "-gencode=arch=compute_53,code=sm_53",  # X1
                    "-gencode=arch=compute_62,code=sm_62",  # X2
                    "-gencode=arch=compute_72,code=sm_72",  # Xavier
                    "-gencode=arch=compute_87,code=sm_87",  # Orin
                ]

            # support for Ada and Hopper is available with CUDA Toolkit 11.8+
            if ctk_version >= (11, 8):
                gencode_opts += [
                    "-gencode=arch=compute_89,code=sm_89",  # Ada
                    "-gencode=arch=compute_90,code=sm_90",  # Hopper
                    # PTX for future hardware
                    "-gencode=arch=compute_90,code=compute_90",
                ]
            else:
                gencode_opts += [
                    # PTX for future hardware
                    "-gencode=arch=compute_86,code=compute_86",
                ]

        nvcc_opts = gencode_opts + [
            "-t0",  # multithreaded compilation
            "--extended-lambda",
        ]

        if fast_math:
            nvcc_opts.append("--use_fast_math")

    # is the library being built with CUDA enabled?
    cuda_enabled = "WP_ENABLE_CUDA=1" if (cu_path is not None) else "WP_ENABLE_CUDA=0"

    if os.name == "nt":
        if warp.config.host_compiler:
            # the linker lives alongside cl.exe in the MSVC bin directory
            host_linker = os.path.join(os.path.dirname(warp.config.host_compiler), "link.exe")
        else:
            raise RuntimeError("Warp build error: No host compiler was found")

        cpp_includes = f' /I"{warp_home_path.parent}/external/llvm-project/out/install/{mode}-{arch}/include"'
        cpp_includes += f' /I"{warp_home_path.parent}/_build/host-deps/llvm-project/release-{arch}/include"'
        cuda_includes = f' /I"{cuda_home}/include"' if cu_path else ""
        includes = cpp_includes + cuda_includes

        # nvrtc_static.lib is built with /MT and _ITERATOR_DEBUG_LEVEL=0 so if we link it in we must match these options
        if cu_path or mode != "debug":
            runtime = "/MT"
            iter_dbg = "_ITERATOR_DEBUG_LEVEL=0"
            debug = "NDEBUG"
        else:
            runtime = "/MTd"
            iter_dbg = "_ITERATOR_DEBUG_LEVEL=2"
            debug = "_DEBUG"

        # NOTE(review): flag selection keys off warp.config.mode while the error
        # message and the nvcc branches below use the *mode* parameter — confirm
        # these are intended to always agree
        if warp.config.mode == "debug":
            cpp_flags = f'/nologo {runtime} /Zi /Od /D "{debug}" /D WP_ENABLE_DEBUG=1 /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" /I"{nanovdb_home}" {includes}'
            linkopts = ["/DLL", "/DEBUG"]
        elif warp.config.mode == "release":
            cpp_flags = f'/nologo {runtime} /Ox /D "{debug}" /D WP_ENABLE_DEBUG=0 /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" /I"{nanovdb_home}" {includes}'
            linkopts = ["/DLL"]
        else:
            raise RuntimeError(f"Unrecognized build configuration (debug, release), got: {mode}")

        if verify_fp:
            cpp_flags += ' /D "WP_VERIFY_FP"'

        if fast_math:
            cpp_flags += " /fp:fast"

        with ScopedTimer("build", active=warp.config.verbose):
            for cpp_path in cpp_paths:
                cpp_out = cpp_path + ".obj"
                linkopts.append(quote(cpp_out))

                cpp_cmd = f'"{warp.config.host_compiler}" {cpp_flags} -c "{cpp_path}" /Fo"{cpp_out}"'
                run_cmd(cpp_cmd)

        if cu_path:
            cu_out = cu_path + ".o"

            if mode == "debug":
                cuda_cmd = f'"{cuda_home}/bin/nvcc" --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -I"{nanovdb_home}" -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
            elif mode == "release":
                cuda_cmd = f'"{cuda_home}/bin/nvcc" -O3 {" ".join(nvcc_opts)} -I"{native_dir}" -I"{nanovdb_home}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'

            with ScopedTimer("build_cuda", active=warp.config.verbose):
                run_cmd(cuda_cmd)

                linkopts.append(quote(cu_out))
                # static CUDA runtime + NVRTC, plus Windows system libraries
                linkopts.append(
                    f'cudart_static.lib nvrtc_static.lib nvrtc-builtins_static.lib nvptxcompiler_static.lib ws2_32.lib user32.lib /LIBPATH:"{cuda_home}/lib/x64"'
                )

        with ScopedTimer("link", active=warp.config.verbose):
            link_cmd = f'"{host_linker}" {" ".join(linkopts + libs)} /out:"{dll_path}"'
            run_cmd(link_cmd)

    else:
        cpp_includes = f' -I"{warp_home_path.parent}/external/llvm-project/out/install/{mode}-{arch}/include"'
        cpp_includes += f' -I"{warp_home_path.parent}/_build/host-deps/llvm-project/release-{arch}/include"'
        cuda_includes = f' -I"{cuda_home}/include"' if cu_path else ""
        includes = cpp_includes + cuda_includes

        if sys.platform == "darwin":
            # cross-compile for the requested slice of a universal binary
            target = f"--target={arch}-apple-macos11"
        else:
            target = ""

        if mode == "debug":
            cpp_flags = f'{target} -O0 -g -fno-rtti -D_DEBUG -DWP_ENABLE_DEBUG=1 -D{cuda_enabled} -D{cutlass_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden --std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -fkeep-inline-functions -I"{native_dir}" {includes}'

        if mode == "release":
            cpp_flags = f'{target} -O3 -DNDEBUG -DWP_ENABLE_DEBUG=0 -D{cuda_enabled} -D{cutlass_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden --std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -I"{native_dir}" {includes}'

        if verify_fp:
            cpp_flags += " -DWP_VERIFY_FP"

        if fast_math:
            cpp_flags += " -ffast-math"

        ld_inputs = []

        with ScopedTimer("build", active=warp.config.verbose):
            for cpp_path in cpp_paths:
                cpp_out = cpp_path + ".o"
                ld_inputs.append(quote(cpp_out))

                build_cmd = f'g++ {cpp_flags} -c "{cpp_path}" -o "{cpp_out}"'
                run_cmd(build_cmd)

        if cu_path:
            cu_out = cu_path + ".o"

            if mode == "debug":
                cuda_cmd = f'"{cuda_home}/bin/nvcc" -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
            elif mode == "release":
                cuda_cmd = f'"{cuda_home}/bin/nvcc" -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'

            with ScopedTimer("build_cuda", active=warp.config.verbose):
                run_cmd(cuda_cmd)

                ld_inputs.append(quote(cu_out))
                # static CUDA runtime + NVRTC, plus required system libraries
                ld_inputs.append(
                    f'-L"{cuda_home}/lib64" -lcudart_static -lnvrtc_static -lnvrtc-builtins_static -lnvptxcompiler_static -lpthread -ldl -lrt'
                )

        if sys.platform == "darwin":
            opt_no_undefined = "-Wl,-undefined,error"
            opt_exclude_libs = ""
        else:
            opt_no_undefined = "-Wl,--no-undefined"
            opt_exclude_libs = "-Wl,--exclude-libs,ALL"

        with ScopedTimer("link", active=warp.config.verbose):
            # rpath relative to the loading binary so sibling libs are found
            origin = "@loader_path" if (sys.platform == "darwin") else "$ORIGIN"
            link_cmd = f"g++ {target} -shared -Wl,-rpath,'{origin}' {opt_no_undefined} {opt_exclude_libs} -o '{dll_path}' {' '.join(ld_inputs + libs)}"
            run_cmd(link_cmd)

            # Strip symbols to reduce the binary size
            if sys.platform == "darwin":
                run_cmd(f"strip -x {dll_path}")  # Strip all local symbols
            else:  # Linux
                # Strip all symbols except for those needed to support debugging JIT-compiled code
                run_cmd(
                    f"strip --strip-all --keep-symbol=__jit_debug_register_code --keep-symbol=__jit_debug_descriptor {dll_path}"
                )
def build_dll(dll_path, cpp_paths, cu_path, libs=[], mode="release", verify_fp=False, fast_math=False, quick=False):
    """Build the Warp native library for the current platform.

    On macOS, x86-64 and AArch64 slices are built separately and fused into a
    universal binary with lipo; elsewhere a single build targets the host
    architecture.
    """
    if sys.platform != "darwin":
        build_dll_for_arch(
            dll_path, cpp_paths, cu_path, libs, mode, machine_architecture(), verify_fp, fast_math, quick
        )
        return

    # create a universal binary by combining x86-64 and AArch64 builds
    slice_paths = [f"{dll_path}-{arch}" for arch in ("x86_64", "aarch64")]
    for arch, slice_path in zip(("x86_64", "aarch64"), slice_paths):
        build_dll_for_arch(slice_path, cpp_paths, cu_path, libs, mode, arch, verify_fp, fast_math, quick)

    run_cmd(f"lipo -create -output {dll_path} {slice_paths[0]} {slice_paths[1]}")

    # the per-architecture intermediates are no longer needed
    for slice_path in slice_paths:
        os.remove(slice_path)