# NOTE: removed stray web-page artifact that preceded this file
# (upload caption: "qbhf2's picture / added NvidiaWarp and GarmentCode repos / 66c9c8a");
# it was not part of the source and would not parse as Python.
# Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import sys
import os
import subprocess
import platform
import warp.config
from warp.utils import ScopedTimer
# return a canonical machine architecture string
# - "x86_64" for x86-64, aka. AMD64, aka. x64
# - "aarch64" for AArch64, aka. ARM64
def machine_architecture() -> str:
    """Map platform.machine() onto a canonical architecture string."""
    reported = platform.machine()
    if reported in ("x86_64", "AMD64"):
        return "x86_64"
    if reported in ("aarch64", "arm64"):
        return "aarch64"
    raise RuntimeError(f"Unrecognized machine architecture {reported}")
def run_cmd(cmd, capture=False):
    """Run *cmd* through the shell and return its stdout as bytes.

    Args:
        cmd: Shell command line to execute.
        capture: Unused; kept for backward compatibility with existing callers.

    Raises:
        subprocess.CalledProcessError: if the command exits with non-zero
            status; any captured output is echoed first to aid debugging.
    """
    if warp.config.verbose:
        print(cmd)

    try:
        return subprocess.check_output(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        # surface whatever output the failed command produced
        if e.stdout:
            print(e.stdout.decode())
        if e.stderr:
            print(e.stderr.decode())
        # bare raise (not `raise (e)`) preserves the original traceback
        raise
# cut-down version of vcvars64.bat that allows using
# custom toolchain locations
def set_msvc_compiler(msvc_path, sdk_path):
    """Point the build environment at a custom MSVC toolchain + Windows SDK."""
    # make sure the variables we append to exist
    for var in ("INCLUDE", "LIB"):
        if var not in os.environ:
            os.environ[var] = ""

    msvc_path = os.path.abspath(msvc_path)
    sdk_path = os.path.abspath(sdk_path)

    include_dirs = [
        os.path.join(msvc_path, "include"),
        os.path.join(sdk_path, "include/winrt"),
        os.path.join(sdk_path, "include/um"),
        os.path.join(sdk_path, "include/ucrt"),
        os.path.join(sdk_path, "include/shared"),
    ]
    lib_dirs = [
        os.path.join(msvc_path, "lib/x64"),
        os.path.join(sdk_path, "lib/ucrt/x64"),
        os.path.join(sdk_path, "lib/um/x64"),
    ]
    bin_dirs = [
        os.path.join(msvc_path, "bin/HostX64/x64"),
        os.path.join(sdk_path, "bin/x64"),
    ]

    os.environ["INCLUDE"] += "".join(os.pathsep + d for d in include_dirs)
    os.environ["LIB"] += "".join(os.pathsep + d for d in lib_dirs)
    os.environ["PATH"] += "".join(os.pathsep + d for d in bin_dirs)

    warp.config.host_compiler = os.path.join(msvc_path, "bin", "HostX64", "x64", "cl.exe")
def find_host_compiler():
    """Locate a host C++ compiler, returning its path or "" if none is found.

    On Windows this uses vswhere to find the latest Visual Studio, runs
    vcvars64.bat and copies the resulting build environment back into
    os.environ, then requires at least the VS2019 toolset (14.29).
    Elsewhere it looks for g++ on the PATH.
    """
    if os.name == "nt":
        try:
            # try and find an installed host compiler (msvc)
            # runs vcvars and copies back the build environment
            vswhere_path = r"%ProgramFiles(x86)%/Microsoft Visual Studio/Installer/vswhere.exe"
            vswhere_path = os.path.expandvars(vswhere_path)
            if not os.path.exists(vswhere_path):
                return ""

            vs_path = run_cmd(f'"{vswhere_path}" -latest -property installationPath').decode().rstrip()
            vsvars_path = os.path.join(vs_path, "VC\\Auxiliary\\Build\\vcvars64.bat")

            output = run_cmd(f'"{vsvars_path}" && set').decode()

            # import the environment vcvars set up into this process
            for line in output.splitlines():
                pair = line.split("=", 1)
                if len(pair) >= 2:
                    os.environ[pair[0]] = pair[1]

            cl_path = run_cmd("where cl.exe").decode("utf-8").rstrip()
            cl_version = os.environ["VCToolsVersion"].split(".")

            # ensure at least VS2019 version, see list of MSVC versions here https://en.wikipedia.org/wiki/Microsoft_Visual_C%2B%2B
            cl_required_major = 14
            cl_required_minor = 29

            if (int(cl_version[0]) < cl_required_major) or (
                int(cl_version[0]) == cl_required_major and int(cl_version[1]) < cl_required_minor
            ):
                print(
                    f"Warp: MSVC found but compiler version too old, found {cl_version[0]}.{cl_version[1]}, but must be {cl_required_major}.{cl_required_minor} or higher, kernel host compilation will be disabled."
                )
                return ""

            return cl_path
        except Exception:
            # couldn't find host compiler
            return ""
    else:
        # try and find g++
        try:
            # rstrip the trailing newline so the result is a usable path,
            # consistent with the .rstrip() on the Windows branch
            return run_cmd("which g++").decode().rstrip()
        except Exception:
            return ""
def get_cuda_toolkit_version(cuda_home):
    """Return the CUDA Toolkit version found in *cuda_home* as a tuple.

    Runs "<cuda_home>/bin/nvcc --version" and parses the release number
    (e.g. (11, 5)). On any failure a message is printed and None is
    returned implicitly.
    """
    import re

    try:
        # the toolkit version can be obtained by running "nvcc --version"
        nvcc_path = os.path.join(cuda_home, "bin", "nvcc")
        version_text = subprocess.check_output([nvcc_path, "--version"]).decode("utf-8")

        # search for release substring (e.g., "release 11.5")
        match = re.search(r"(?<=release )\d+\.\d+", version_text)
        if match is None:
            raise Exception("Failed to parse NVCC output")

        major, minor = match.group(0).split(".")
        return (int(major), int(minor))
    except Exception as e:
        print(f"Failed to determine CUDA Toolkit version: {e}")
def quote(path):
    """Wrap *path* in double quotes for use in a shell command line."""
    return f'"{path}"'
def build_dll_for_arch(dll_path, cpp_paths, cu_path, libs, mode, arch, verify_fp=False, fast_math=False, quick=False):
    """Compile and link the Warp native shared library for one architecture.

    Args:
        dll_path: Output path of the shared library to produce.
        cpp_paths: C++ source files to compile with the host compiler.
        cu_path: CUDA source file to compile with nvcc, or None for a CPU-only build.
        libs: Extra library arguments passed verbatim to the linker.
        mode: Build configuration, "debug" or "release".
        arch: Target machine architecture, "x86_64" or "aarch64".
        verify_fp: If True, define WP_VERIFY_FP in the C++ build.
        fast_math: If True, enable fast-math optimizations (nvcc --use_fast_math / /fp:fast / -ffast-math).
        quick: Faster build — minimal PTX-only GPU targets and CUTLASS disabled.

    Raises:
        Exception: if the installed CUDA Toolkit is older than the required minimum.
        RuntimeError: on Windows when no host compiler is configured, or when
            warp.config.mode is neither "debug" nor "release".
    """
    cuda_home = warp.config.cuda_path
    cuda_cmd = None

    # quick builds disable CUTLASS entirely to cut compile time
    if quick:
        cutlass_includes = ""
        cutlass_enabled = "WP_ENABLE_CUTLASS=0"
    else:
        cutlass_home = "warp/native/cutlass"
        cutlass_includes = f'-I"{cutlass_home}/include" -I"{cutlass_home}/tools/util/include"'
        cutlass_enabled = "WP_ENABLE_CUTLASS=1"

    if quick or cu_path is None:
        cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=0"
    else:
        cuda_compat_enabled = "WP_ENABLE_CUDA_COMPATIBILITY=1"

    import pathlib

    warp_home_path = pathlib.Path(__file__).parent
    warp_home = warp_home_path.resolve()
    nanovdb_home = warp_home_path.parent / "_build/host-deps/nanovdb/include"

    # output stale, rebuild
    if warp.config.verbose:
        print(f"Building {dll_path}")

    native_dir = os.path.join(warp_home, "native")

    if cu_path:
        # check CUDA Toolkit version
        min_ctk_version = (11, 5)
        # fall back to the minimum if the version cannot be determined
        ctk_version = get_cuda_toolkit_version(cuda_home) or min_ctk_version
        if ctk_version < min_ctk_version:
            raise Exception(
                f"CUDA Toolkit version {min_ctk_version[0]}.{min_ctk_version[1]}+ is required (found {ctk_version[0]}.{ctk_version[1]} in {cuda_home})"
            )

        gencode_opts = []

        if quick:
            # minimum supported architectures (PTX)
            gencode_opts += ["-gencode=arch=compute_52,code=compute_52", "-gencode=arch=compute_75,code=compute_75"]
        else:
            # generate code for all supported architectures
            gencode_opts += [
                # SASS for supported desktop/datacenter architectures
                "-gencode=arch=compute_52,code=sm_52",  # Maxwell
                "-gencode=arch=compute_60,code=sm_60",  # Pascal
                "-gencode=arch=compute_61,code=sm_61",
                "-gencode=arch=compute_70,code=sm_70",  # Volta
                "-gencode=arch=compute_75,code=sm_75",  # Turing
                "-gencode=arch=compute_80,code=sm_80",  # Ampere
                "-gencode=arch=compute_86,code=sm_86",
            ]

            if arch == "aarch64" and sys.platform == "linux":
                gencode_opts += [
                    # SASS for supported mobile architectures (e.g. Tegra/Jetson)
                    "-gencode=arch=compute_53,code=sm_53",  # X1
                    "-gencode=arch=compute_62,code=sm_62",  # X2
                    "-gencode=arch=compute_72,code=sm_72",  # Xavier
                    "-gencode=arch=compute_87,code=sm_87",  # Orin
                ]

            # support for Ada and Hopper is available with CUDA Toolkit 11.8+
            if ctk_version >= (11, 8):
                gencode_opts += [
                    "-gencode=arch=compute_89,code=sm_89",  # Ada
                    "-gencode=arch=compute_90,code=sm_90",  # Hopper
                    # PTX for future hardware
                    "-gencode=arch=compute_90,code=compute_90",
                ]
            else:
                gencode_opts += [
                    # PTX for future hardware
                    "-gencode=arch=compute_86,code=compute_86",
                ]

        nvcc_opts = gencode_opts + [
            "-t0",  # multithreaded compilation
            "--extended-lambda",
        ]

        if fast_math:
            nvcc_opts.append("--use_fast_math")

    # is the library being built with CUDA enabled?
    cuda_enabled = "WP_ENABLE_CUDA=1" if (cu_path is not None) else "WP_ENABLE_CUDA=0"

    if os.name == "nt":
        if warp.config.host_compiler:
            # the linker lives alongside cl.exe in the MSVC bin directory
            host_linker = os.path.join(os.path.dirname(warp.config.host_compiler), "link.exe")
        else:
            raise RuntimeError("Warp build error: No host compiler was found")

        cpp_includes = f' /I"{warp_home_path.parent}/external/llvm-project/out/install/{mode}-{arch}/include"'
        cpp_includes += f' /I"{warp_home_path.parent}/_build/host-deps/llvm-project/release-{arch}/include"'
        cuda_includes = f' /I"{cuda_home}/include"' if cu_path else ""
        includes = cpp_includes + cuda_includes

        # nvrtc_static.lib is built with /MT and _ITERATOR_DEBUG_LEVEL=0 so if we link it in we must match these options
        if cu_path or mode != "debug":
            runtime = "/MT"
            iter_dbg = "_ITERATOR_DEBUG_LEVEL=0"
            debug = "NDEBUG"
        else:
            runtime = "/MTd"
            iter_dbg = "_ITERATOR_DEBUG_LEVEL=2"
            debug = "_DEBUG"

        # NOTE(review): flag selection keys off warp.config.mode while the error
        # message and the nvcc branches below use the *mode* parameter — confirm
        # these are intended to always agree
        if warp.config.mode == "debug":
            cpp_flags = f'/nologo {runtime} /Zi /Od /D "{debug}" /D WP_ENABLE_DEBUG=1 /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" /I"{nanovdb_home}" {includes}'
            linkopts = ["/DLL", "/DEBUG"]
        elif warp.config.mode == "release":
            cpp_flags = f'/nologo {runtime} /Ox /D "{debug}" /D WP_ENABLE_DEBUG=0 /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" /I"{nanovdb_home}" {includes}'
            linkopts = ["/DLL"]
        else:
            raise RuntimeError(f"Unrecognized build configuration (debug, release), got: {mode}")

        if verify_fp:
            cpp_flags += ' /D "WP_VERIFY_FP"'

        if fast_math:
            cpp_flags += " /fp:fast"

        with ScopedTimer("build", active=warp.config.verbose):
            for cpp_path in cpp_paths:
                cpp_out = cpp_path + ".obj"
                linkopts.append(quote(cpp_out))

                cpp_cmd = f'"{warp.config.host_compiler}" {cpp_flags} -c "{cpp_path}" /Fo"{cpp_out}"'
                run_cmd(cpp_cmd)

        if cu_path:
            cu_out = cu_path + ".o"

            if mode == "debug":
                cuda_cmd = f'"{cuda_home}/bin/nvcc" --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -I"{nanovdb_home}" -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
            elif mode == "release":
                cuda_cmd = f'"{cuda_home}/bin/nvcc" -O3 {" ".join(nvcc_opts)} -I"{native_dir}" -I"{nanovdb_home}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'

            with ScopedTimer("build_cuda", active=warp.config.verbose):
                run_cmd(cuda_cmd)

                linkopts.append(quote(cu_out))
                # static CUDA runtime + NVRTC, plus Windows system libraries
                linkopts.append(
                    f'cudart_static.lib nvrtc_static.lib nvrtc-builtins_static.lib nvptxcompiler_static.lib ws2_32.lib user32.lib /LIBPATH:"{cuda_home}/lib/x64"'
                )

        with ScopedTimer("link", active=warp.config.verbose):
            link_cmd = f'"{host_linker}" {" ".join(linkopts + libs)} /out:"{dll_path}"'
            run_cmd(link_cmd)

    else:
        cpp_includes = f' -I"{warp_home_path.parent}/external/llvm-project/out/install/{mode}-{arch}/include"'
        cpp_includes += f' -I"{warp_home_path.parent}/_build/host-deps/llvm-project/release-{arch}/include"'
        cuda_includes = f' -I"{cuda_home}/include"' if cu_path else ""
        includes = cpp_includes + cuda_includes

        if sys.platform == "darwin":
            # cross-compile for the requested slice of a universal binary
            target = f"--target={arch}-apple-macos11"
        else:
            target = ""

        if mode == "debug":
            cpp_flags = f'{target} -O0 -g -fno-rtti -D_DEBUG -DWP_ENABLE_DEBUG=1 -D{cuda_enabled} -D{cutlass_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden --std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -fkeep-inline-functions -I"{native_dir}" {includes}'

        if mode == "release":
            cpp_flags = f'{target} -O3 -DNDEBUG -DWP_ENABLE_DEBUG=0 -D{cuda_enabled} -D{cutlass_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden --std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -I"{native_dir}" {includes}'

        if verify_fp:
            cpp_flags += " -DWP_VERIFY_FP"

        if fast_math:
            cpp_flags += " -ffast-math"

        ld_inputs = []

        with ScopedTimer("build", active=warp.config.verbose):
            for cpp_path in cpp_paths:
                cpp_out = cpp_path + ".o"
                ld_inputs.append(quote(cpp_out))

                build_cmd = f'g++ {cpp_flags} -c "{cpp_path}" -o "{cpp_out}"'
                run_cmd(build_cmd)

        if cu_path:
            cu_out = cu_path + ".o"

            if mode == "debug":
                cuda_cmd = f'"{cuda_home}/bin/nvcc" -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
            elif mode == "release":
                cuda_cmd = f'"{cuda_home}/bin/nvcc" -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'

            with ScopedTimer("build_cuda", active=warp.config.verbose):
                run_cmd(cuda_cmd)

                ld_inputs.append(quote(cu_out))
                # static CUDA runtime + NVRTC, plus required system libraries
                ld_inputs.append(
                    f'-L"{cuda_home}/lib64" -lcudart_static -lnvrtc_static -lnvrtc-builtins_static -lnvptxcompiler_static -lpthread -ldl -lrt'
                )

        if sys.platform == "darwin":
            opt_no_undefined = "-Wl,-undefined,error"
            opt_exclude_libs = ""
        else:
            opt_no_undefined = "-Wl,--no-undefined"
            opt_exclude_libs = "-Wl,--exclude-libs,ALL"

        with ScopedTimer("link", active=warp.config.verbose):
            # rpath relative to the loading binary so sibling libs are found
            origin = "@loader_path" if (sys.platform == "darwin") else "$ORIGIN"
            link_cmd = f"g++ {target} -shared -Wl,-rpath,'{origin}' {opt_no_undefined} {opt_exclude_libs} -o '{dll_path}' {' '.join(ld_inputs + libs)}"
            run_cmd(link_cmd)

            # Strip symbols to reduce the binary size
            if sys.platform == "darwin":
                run_cmd(f"strip -x {dll_path}")  # Strip all local symbols
            else:  # Linux
                # Strip all symbols except for those needed to support debugging JIT-compiled code
                run_cmd(
                    f"strip --strip-all --keep-symbol=__jit_debug_register_code --keep-symbol=__jit_debug_descriptor {dll_path}"
                )
def build_dll(dll_path, cpp_paths, cu_path, libs=[], mode="release", verify_fp=False, fast_math=False, quick=False):
    """Build the Warp native library for the current platform.

    On macOS, x86-64 and AArch64 slices are built separately and fused into a
    universal binary with lipo; elsewhere a single build targets the host
    architecture.
    """
    if sys.platform != "darwin":
        build_dll_for_arch(
            dll_path, cpp_paths, cu_path, libs, mode, machine_architecture(), verify_fp, fast_math, quick
        )
        return

    # create a universal binary by combining x86-64 and AArch64 builds
    slice_paths = [f"{dll_path}-{arch}" for arch in ("x86_64", "aarch64")]
    for arch, slice_path in zip(("x86_64", "aarch64"), slice_paths):
        build_dll_for_arch(slice_path, cpp_paths, cu_path, libs, mode, arch, verify_fp, fast_math, quick)

    run_cmd(f"lipo -create -output {dll_path} {slice_paths[0]} {slice_paths[1]}")

    # the per-architecture intermediates are no longer needed
    for slice_path in slice_paths:
        os.remove(slice_path)