diff --git a/.venv/lib/python3.11/site-packages/triton/backends/amd/compiler.py b/.venv/lib/python3.11/site-packages/triton/backends/amd/compiler.py new file mode 100644 index 0000000000000000000000000000000000000000..bdf2f863b3d6d67ad997a25fb24e954c9966a1b5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/triton/backends/amd/compiler.py @@ -0,0 +1,262 @@ +from triton.backends.compiler import BaseBackend, GPUTarget +from triton._C.libtriton import ir, passes, llvm, amd +from dataclasses import dataclass +from typing import Any, Tuple +import hashlib +import tempfile +import os +import re +import subprocess +import functools +from pathlib import Path + + +@dataclass(frozen=True) +class HIPOptions: + num_warps: int = 4 + waves_per_eu: int = 1 + num_stages: int = 0 + num_ctas: int = 1 + extern_libs: dict = None + cluster_dims: tuple = (1, 1, 1) + debug: bool = False + arch: str = None + allow_fp8e4nv: bool = False + allow_fp8e4b15: bool = False + default_dot_input_precision: str = "ieee" + allowed_dot_input_precisions: Tuple[str] = ("ieee", ) + enable_fp_fusion: bool = True + matrix_instr_nonkdim: int = 0 + kpack: int = 1 + allow_flush_denorm: bool = False + max_num_imprecise_acc_default: int = 0 + backend_name: str = 'hip' + + def __post_init__(self): + default_libdir = Path(__file__).parent / 'lib' + extern_libs = {} if self.extern_libs is None else dict(self.extern_libs) + # Ignore user-defined warp size for gfx9 + warp_size = 32 if 'gfx10' in self.arch or 'gfx11' in self.arch else 64 + object.__setattr__(self, 'warp_size', warp_size) + libs = ["ocml", "ockl"] + for lib in libs: + extern_libs[lib] = str(default_libdir / f'{lib}.bc') + object.__setattr__(self, 'extern_libs', tuple(extern_libs.items())) + assert self.num_warps > 0 and (self.num_warps & (self.num_warps - 1)) == 0, \ + "num_warps must be a power of 2" + + def hash(self): + key = '_'.join([f'{name}-{val}' for name, val in self.__dict__.items()]) + return hashlib.sha256(key.encode("utf-8")).hexdigest() + + +class HIPBackend(BaseBackend): + + @staticmethod + def supports_target(target: GPUTarget): + return target.backend == 'hip' + + def __init__(self, target: GPUTarget) -> None: + super().__init__(target) + assert isinstance(target.arch, str) + self.binary_ext = "hsaco" + + def parse_options(self, opts) -> Any: + args = {'arch': self.target.arch} + args.update({k: opts[k] for k in HIPOptions.__dataclass_fields__.keys() if k in opts}) + return HIPOptions(**args) + + def pack_metadata(self, metadata): + return ( + metadata.num_warps, + metadata.num_ctas, + metadata.shared, + metadata.cluster_dims[0], + metadata.cluster_dims[1], + metadata.cluster_dims[2], + ) + + def get_codegen_implementation(self): + codegen_fns = dict() + return codegen_fns + + def load_dialects(self, ctx): + amd.load_dialects(ctx) + + @staticmethod + def path_to_rocm_lld(): + # Check env path for ld.lld + lld_env_path = os.getenv("TRITON_HIP_LLD_PATH") + if lld_env_path is not None: + lld = Path(lld_env_path) + if lld.is_file(): + return lld + # Check backend for ld.lld (used for pytorch wheels) + lld = Path(__file__).parent / "llvm/bin/ld.lld" + if lld.is_file(): + return lld + lld = Path("/opt/rocm/llvm/bin/ld.lld") + if lld.is_file(): + return lld + lld = Path("/usr/bin/ld.lld") + if lld.is_file(): + return lld + raise Exception("ROCm linker /opt/rocm/llvm/bin/ld.lld not found") + + @staticmethod + def make_ttir(mod, metadata, options): + pm = ir.pass_manager(mod.context) + pm.enable_debug() + passes.common.add_inliner(pm) + passes.ttir.add_rewrite_tensor_pointer(pm) + passes.ttir.add_combine(pm) + passes.common.add_canonicalizer(pm) + passes.ttir.add_reorder_broadcast(pm) + passes.common.add_cse(pm) + passes.common.add_licm(pm) + passes.common.add_symbol_dce(pm) + pm.run(mod) + return mod + + @staticmethod + def make_ttgir(mod, metadata, options): + pm = ir.pass_manager(mod.context) + pm.enable_debug() + passes.ttir.add_convert_to_ttgpuir(pm, f"hip:{options.arch}", options.num_warps, options.warp_size, + options.num_ctas) + pm.run(mod) + pm = ir.pass_manager(mod.context) + pm.enable_debug() + passes.ttgpuir.add_coalesce(pm) + passes.ttgpuir.add_remove_layout_conversions(pm) + passes.ttgpuir.add_optimize_thread_locality(pm) + amd.passes.ttgpuir.add_accelerate_matmul(pm, options.arch, options.matrix_instr_nonkdim, options.kpack) + passes.ttgpuir.add_remove_layout_conversions(pm) + amd.passes.ttgpuir.add_optimize_epilogue(pm) + passes.ttgpuir.add_optimize_dot_operands(pm, True) + if options.num_stages == 0 and amd.has_matrix_core_feature(options.arch): + amd.passes.ttgpuir.add_stream_pipeline(pm) + passes.common.add_canonicalizer(pm) + passes.ttgpuir.add_optimize_dot_operands(pm, True) + passes.ttgpuir.add_remove_layout_conversions(pm) + passes.ttgpuir.add_reduce_data_duplication(pm) + if options.num_stages != 0: + amd.passes.ttgpuir.add_reorder_instructions(pm) + passes.common.add_cse(pm) + passes.common.add_symbol_dce(pm) + pm.run(mod) + return mod + + @staticmethod + def make_llir(src, metadata, options): + mod = src + # TritonGPU -> LLVM-IR (MLIR) + pm = ir.pass_manager(mod.context) + pm.enable_debug() + amd.passes.ttgpuir.add_decompose_unsupported_conversions(pm, options.arch) + passes.convert.add_scf_to_cf(pm) + passes.convert.add_index_to_llvmir(pm) + + passes.ttgpuir.add_allocate_shared_memory(pm) + ## __HIP_FTZ is used to control the denorm flushing behavior of exp2 op as follows: + ## 1. If __HIP_FTZ = 1, exp2 flushes denorms in input and output regardless + ## of the value of kernel arg `allow_flush_denorm`. + ## 2. If __HIP_FTZ = 0, whether exp2 flushes denorms in input and output + ## depends on the value of kernel arg `allow_flush_denorm`. + ## 3. __HIP_FTZ is default to 1 and not exposed as a kernel argument. + ## For now it is used as a controller for developers only. + __HIP_FTZ = True + amd.passes.ttgpuir.add_to_llvmir(pm, options.arch, __HIP_FTZ) + passes.common.add_canonicalizer(pm) + passes.common.add_cse(pm) + + passes.convert.add_cf_to_llvmir(pm) + passes.convert.add_arith_to_llvmir(pm) + passes.common.add_canonicalizer(pm) + passes.common.add_cse(pm) + passes.common.add_symbol_dce(pm) + if os.environ.get("TRITON_DISABLE_LINE_INFO", "0") == "0": + passes.llvmir.add_di_scope(pm) + # This pass (`add_builtin_func_to_llvmir`) serves as a temporary workaround to address the issue of excessive basic block + # count caused by predicated loads/stores. In certain kernels, the addition of these blocks can cause the MLIR + # canonicalizer to never finish when attempting to merge blocks. The permanent solution under consideration + # involves using MUBUF instructions that have built-in out-of-bounds checks, which would eliminate the need + # for conditional branching around memory accesses. + amd.passes.ttgpuir.add_builtin_func_to_llvmir(pm) + pm.run(mod) + + # LLVM-IR (MLIR) -> LLVM-IR (LLVM) + llvm.init_targets() + context = llvm.context() + llvm_mod = llvm.to_module(mod, context) + + # Set various control constants on the LLVM module so that device + # libraries can resolve references to them. + amd.set_isa_version(llvm_mod, options.arch) + amd.set_abi_version(llvm_mod, 400) + amd.set_bool_control_constant(llvm_mod, "__oclc_finite_only_opt", False) + amd.set_bool_control_constant(llvm_mod, "__oclc_correctly_rounded_sqrt32", True) + amd.set_bool_control_constant(llvm_mod, "__oclc_unsafe_math_opt", False) + amd.set_bool_control_constant(llvm_mod, "__oclc_wavefrontsize64", options.warp_size == 64) + + # Set kernel attributes first given this may affect later optimizations. + fns = [fn for fn in llvm_mod.get_functions() if not fn.is_declaration()] + # The public kernel should be kernel 0. + fns[0].set_calling_conv(amd.CALLING_CONV_AMDGPU_KERNEL) + fns[0].add_fn_attr("amdgpu-flat-work-group-size", f"1,{options.num_warps*options.warp_size}") + fns[0].add_fn_attr("amdgpu-waves-per-eu", f"{options.waves_per_eu}") + denormal_mode = "preserve-sign" if options.allow_flush_denorm else "ieee" + fns[0].add_fn_attr("denormal-fp-math-f32", denormal_mode) + + if options.extern_libs: + paths = [path for (name, path) in options.extern_libs if amd.need_extern_lib(llvm_mod, name)] + llvm.link_extern_libs(llvm_mod, paths) + + llvm.optimize_module(llvm_mod, llvm.OPTIMIZE_O3, amd.TARGET_TRIPLE) + + # Get some metadata + metadata["shared"] = src.get_int_attr("triton_gpu.shared") + + amd.cleanup_bitcode_metadata(llvm_mod) + return str(llvm_mod) + + @staticmethod + def make_amdgcn(src, metadata, options): + # Find kernel names (there should only be one) + # We get the name at the last possible step to accomodate `triton.compile` + # on user-provided LLVM + names = re.findall(r"define amdgpu_kernel void @([a-zA-Z_][a-zA-Z0-9_]*)", src) + assert len(names) == 1 + metadata["name"] = names[0] + # llvm -> hsaco + amdgcn = llvm.translate_to_asm(src, amd.TARGET_TRIPLE, options.arch, '', [], options.enable_fp_fusion, False) + if os.environ.get("AMDGCN_ENABLE_DUMP", "0") == "1": + print("// -----// AMDGCN Dump //----- //") + print(amdgcn) + return amdgcn + + @staticmethod + def make_hsaco(src, metadata, options): + hsaco = amd.assemble_amdgcn(src, options.arch, '') + + rocm_path = HIPBackend.path_to_rocm_lld() + with tempfile.NamedTemporaryFile() as tmp_out: + with tempfile.NamedTemporaryFile() as tmp_in: + with open(tmp_in.name, 'wb') as fd_in: + fd_in.write(hsaco) + subprocess.check_call([rocm_path, '-flavor', 'gnu', '-shared', tmp_in.name, '-o', tmp_out.name]) + with open(tmp_out.name, 'rb') as fd_out: + ret = fd_out.read() + return ret + + def add_stages(self, stages, options): + stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options) + stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options) + stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options) + stages["amdgcn"] = lambda src, metadata: self.make_amdgcn(src, metadata, options) + stages["hsaco"] = lambda src, metadata: self.make_hsaco(src, metadata, options) + + @functools.lru_cache() + def hash(self): + version = subprocess.check_output([HIPBackend.path_to_rocm_lld(), "--version"], encoding='utf-8') + return f'{version}-{self.target}' diff --git a/.venv/lib/python3.11/site-packages/triton/backends/amd/driver.c b/.venv/lib/python3.11/site-packages/triton/backends/amd/driver.c new file mode 100644 index 0000000000000000000000000000000000000000..233613a55411162b0ff2b74e4b2157165c35e59f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/triton/backends/amd/driver.c @@ -0,0 +1,211 @@ +#define __HIP_PLATFORM_AMD__ +// clang-format off +// hip_depreated.h needs definitions from hip_runtime.h. +#include +#include +// clang-format on +#define PY_SSIZE_T_CLEAN +#include +#include +#include +#include + +// The list of paths to search for the HIP runtime library. The caller Python +// code should substitute the search path placeholder. +static const char *hipLibSearchPaths[] = {"/*py_libhip_search_path*/"}; + +// The list of HIP dynamic library symbols and their signature we are interested +// in this file. +// |FOR_EACH_ERR_FN| is a macro to process APIs that return hipError_t; +// |FOR_EACH_STR_FN| is a macro to process APIs that return const char *. +// +// HIP 6.0 introduced an updated hipGetDeviceProperties API under a new symbol, +// hipGetDevicePropertiesR0600. However, the associated hipDeviceProp_t was +// directly updated with breaking changes to match hipGetDevicePropertiesR0600 +// in the header file. We include the header file from HIP 6.0. So here if we +// use hipGetDeviceProperties together with hipDeviceProp_t we will use the +// old API with a new struct definition and mess up the interpretation. +// +// This is a known issue: https://github.com/ROCm/ROCm/issues/2728. +// +// For now explicitly defer to the old hipDeviceProp_t struct. This should work +// for both 5.x and 6.x. In the long term we need to switch to use +// hipGetProcAddress once available: +// https://github.com/ROCm/clr/commit/0479cdb3dd30ef58718cad44e424bd793c394cc0 +#define HIP_SYMBOL_LIST(FOR_EACH_ERR_FN, FOR_EACH_STR_FN) \ + FOR_EACH_STR_FN(hipGetErrorString, hipError_t hipError) \ + FOR_EACH_ERR_FN(hipGetDeviceProperties, hipDeviceProp_tR0000 *prop, \ + int deviceId) \ + FOR_EACH_ERR_FN(hipModuleLoadDataEx, hipModule_t *module, const void *image, \ + unsigned int numOptions, hipJitOption *options, \ + void **optionValues) \ + FOR_EACH_ERR_FN(hipModuleGetFunction, hipFunction_t *function, \ + hipModule_t module, const char *kname) \ + FOR_EACH_ERR_FN(hipFuncGetAttribute, int *, hipFunction_attribute attr, \ + hipFunction_t function) + +// The HIP symbol table for holding resolved dynamic library symbols. +struct HIPSymbolTable { +#define DEFINE_EACH_ERR_FIELD(hipSymbolName, ...) \ + hipError_t (*hipSymbolName)(__VA_ARGS__); +#define DEFINE_EACH_STR_FIELD(hipSymbolName, ...) \ + const char *(*hipSymbolName)(__VA_ARGS__); + + HIP_SYMBOL_LIST(DEFINE_EACH_ERR_FIELD, DEFINE_EACH_STR_FIELD) +}; + +static struct HIPSymbolTable hipSymbolTable; + +bool initSymbolTable() { + // Use the HIP runtime library loaded into the existing process if it exits. + void *lib = dlopen("libamdhip64.so", RTLD_NOLOAD); + if (lib) { + // printf("[triton] chosen loaded libamdhip64.so in the process\n"); + } + + // Otherwise, go through the list of search paths to dlopen the first HIP + // driver library. + if (!lib) { + int n = sizeof(hipLibSearchPaths) / sizeof(hipLibSearchPaths[0]); + for (int i = 0; i < n; ++i) { + void *handle = dlopen(hipLibSearchPaths[i], RTLD_LAZY | RTLD_LOCAL); + if (handle) { + lib = handle; + // printf("[triton] chosen %s\n", hipLibSearchPaths[i]); + } + } + } + if (!lib) { + PyErr_SetString(PyExc_RuntimeError, "cannot open libamdhip64.so"); + return false; + } + + // Resolve all symbols we are interested in. + dlerror(); // Clear existing errors + const char *error = NULL; +#define QUERY_EACH_FN(hipSymbolName, ...) \ + *(void **)&hipSymbolTable.hipSymbolName = dlsym(lib, #hipSymbolName); \ + error = dlerror(); \ + if (error) { \ + PyErr_SetString(PyExc_RuntimeError, \ + "cannot query " #hipSymbolName " from libamdhip64.so"); \ + dlclose(lib); \ + return false; \ + } + + HIP_SYMBOL_LIST(QUERY_EACH_FN, QUERY_EACH_FN) + + return true; +} + +static inline void gpuAssert(hipError_t code, const char *file, int line) { + { + if (code != HIP_SUCCESS) { + { + const char *prefix = "Triton Error [HIP]: "; + const char *str = hipSymbolTable.hipGetErrorString(code); + char err[1024] = {0}; + snprintf(err, 1024, "%s Code: %d, Messsage: %s", prefix, code, str); + PyGILState_STATE gil_state; + gil_state = PyGILState_Ensure(); + PyErr_SetString(PyExc_RuntimeError, err); + PyGILState_Release(gil_state); + } + } + } +} + +#define HIP_CHECK(ans) \ + { \ + gpuAssert((ans), __FILE__, __LINE__); \ + if (PyErr_Occurred()) \ + return NULL; \ + } + +static PyObject *getDeviceProperties(PyObject *self, PyObject *args) { + int device_id; + if (!PyArg_ParseTuple(args, "i", &device_id)) + return NULL; + + hipDeviceProp_tR0000 props; + HIP_CHECK(hipSymbolTable.hipGetDeviceProperties(&props, device_id)); + + // create a struct to hold device properties + return Py_BuildValue( + "{s:i, s:i, s:i, s:i, s:i, s:i, s:s, s:i}", "max_shared_mem", + props.sharedMemPerBlock, "max_num_regs", props.regsPerBlock, + "multiprocessor_count", props.multiProcessorCount, "sm_clock_rate", + props.clockRate, "mem_clock_rate", props.memoryClockRate, "mem_bus_width", + props.memoryBusWidth, "arch", props.gcnArchName, "warpSize", + props.warpSize); +} + +static PyObject *loadBinary(PyObject *self, PyObject *args) { + const char *name; + const char *data; + Py_ssize_t data_size; + int shared; + int device; + if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared, + &device)) { + return NULL; + } + + // set HIP options + hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes, + hipJitOptionErrorLogBuffer, + hipJitOptionInfoLogBufferSizeBytes, + hipJitOptionInfoLogBuffer, hipJitOptionLogVerbose}; + const unsigned int errbufsize = 8192; + const unsigned int logbufsize = 8192; + char _err[errbufsize]; + char _log[logbufsize]; + void *optval[] = {(void *)(uintptr_t)errbufsize, (void *)_err, + (void *)(uintptr_t)logbufsize, (void *)_log, (void *)1}; + + // launch HIP Binary + hipModule_t mod; + hipFunction_t fun; + HIP_CHECK(hipSymbolTable.hipModuleLoadDataEx(&mod, data, 5, opt, optval)) + HIP_CHECK(hipSymbolTable.hipModuleGetFunction(&fun, mod, name)); + + // get allocated registers and spilled registers from the function + int n_regs = 0; + int n_spills = 0; + hipSymbolTable.hipFuncGetAttribute(&n_regs, HIP_FUNC_ATTRIBUTE_NUM_REGS, fun); + hipSymbolTable.hipFuncGetAttribute(&n_spills, + HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun); + n_spills /= 4; + if (PyErr_Occurred()) { + return NULL; + } + return Py_BuildValue("(KKii)", (uint64_t)mod, (uint64_t)fun, n_regs, + n_spills); +} + +static PyMethodDef ModuleMethods[] = { + {"load_binary", loadBinary, METH_VARARGS, + "Load provided hsaco into HIP driver"}, + {"get_device_properties", getDeviceProperties, METH_VARARGS, + "Get the properties for a given device"}, + {NULL, NULL, 0, NULL} // sentinel +}; + +static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "hip_utils", + NULL, // documentation + -1, // size + ModuleMethods}; + +PyMODINIT_FUNC PyInit_hip_utils(void) { + if (!initSymbolTable()) { + return NULL; + } + + PyObject *m = PyModule_Create(&ModuleDef); + if (m == NULL) { + return NULL; + } + PyModule_AddFunctions(m, ModuleMethods); + + return m; +} diff --git a/.venv/lib/python3.11/site-packages/triton/backends/amd/driver.py b/.venv/lib/python3.11/site-packages/triton/backends/amd/driver.py new file mode 100644 index 0000000000000000000000000000000000000000..c1ff6e1d65f3fbbdf824b93ad17f80164bc96626 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/triton/backends/amd/driver.py @@ -0,0 +1,497 @@ +import functools +import os +import hashlib +import subprocess +import tempfile +from pathlib import Path +from triton.runtime.build import _build +from triton.runtime.cache import get_cache_manager +from triton.backends.compiler import GPUTarget +from triton.backends.driver import GPUDriver + +dirname = os.path.dirname(os.path.realpath(__file__)) +include_dir = [os.path.join(dirname, "include")] + + +def _find_already_mmapped_dylib_on_linux(lib_name): + import platform + if platform.system() != 'Linux': + return None + + # Use dl_iterate_phdr to walk through the list of shared libraries at runtime. + # See https://www.man7.org/linux/man-pages/man3/dl_iterate_phdr.3.html for details. + + import ctypes + from ctypes import c_char, c_int, c_size_t, c_void_p, c_char_p, POINTER + + class DlPhdrInfo(ctypes.Structure): + _fields_ = [ + ('dlpi_addr', c_void_p), + ('dlpi_name', c_char_p), + # We don't care about the remaining fields. + ] + + # callback_t must use POINTER(c_char) to avoid copying. + callback_t = ctypes.CFUNCTYPE(c_int, POINTER(DlPhdrInfo), POINTER(c_size_t), POINTER(c_char)) + + # Load libc and get the dl_iterate_phdr symbol. + try: + dl_iterate_phdr = ctypes.CDLL('libc.so.6').dl_iterate_phdr + except: + return None + # argtypes must use c_char_p to accept create_string_buffer. + dl_iterate_phdr.argtypes = [callback_t, c_char_p] + dl_iterate_phdr.restype = c_int + + max_path_length = 4096 + path = ctypes.create_string_buffer(max_path_length + 1) + + # Define callback to get the loaded dylib path. + def callback(info, size, data): + dlpi_name = info.contents.dlpi_name + p = Path(os.fsdecode(dlpi_name)) + if lib_name in p.name: + # Found the dylib; get its path. + ctypes.memmove(data, dlpi_name, min(max_path_length, len(dlpi_name))) + return 1 + return 0 + + if dl_iterate_phdr(callback_t(callback), path): + return os.fsdecode(ctypes.string_at(path)) + return None + + +@functools.lru_cache() +def _get_path_to_hip_runtime_dylib(): + lib_name = "libamdhip64.so" + + # If we are told explicitly what HIP runtime dynamic library to use, obey that. + env_libhip_path = os.getenv("TRITON_LIBHIP_PATH") + if env_libhip_path: + if env_libhip_path.endswith(lib_name) and os.path.exists(env_libhip_path): + return env_libhip_path + raise RuntimeError(f"TRITON_LIBHIP_PATH '{env_libhip_path}' does not point to a valid {lib_name}") + + # If the shared object is already mmapped to address space, use it. + mmapped_path = _find_already_mmapped_dylib_on_linux(lib_name) + if mmapped_path: + if os.path.exists(mmapped_path): + return mmapped_path + raise RuntimeError(f"memory mapped '{mmapped_path}' in process does not point to a valid {lib_name}") + + paths = [] + + import site + # First search the HIP runtime dynamic library packaged with PyTorch. It's very likely + # that we run Triton together with PyTorch. This makes sure we use the same dynamic + # library to avoid version mismatch. + site_packages = site.getsitepackages() + user_site = site.getusersitepackages() + if site.ENABLE_USER_SITE: # ENABLE_USER_SITE is initialized in getusersitepackages() + site_packages = [user_site] + site_packages + for path in site_packages: + path = os.path.join(path, "torch", "lib", lib_name) + if os.path.exists(path): + return path + paths.append(path) + + # Then try to see if developer provides a HIP runtime dynamic library using LD_LIBARAY_PATH. + env_ld_library_path = os.getenv("LD_LIBRARY_PATH") + if env_ld_library_path: + for d in env_ld_library_path.split(":"): + f = os.path.join(d, lib_name) + if os.path.exists(f): + return f + paths.append(f) + + # Afterwards try to search the loader dynamic library resolution paths. + libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode() + # each line looks like the following: + # libamdhip64.so.6 (libc6,x86-64) => /opt/rocm-6.0.2/lib/libamdhip64.so.6 + # libamdhip64.so (libc6,x86-64) => /opt/rocm-6.0.2/lib/libamdhip64.so + locs = [line.split()[-1] for line in libs.splitlines() if line.strip().endswith(lib_name)] + for loc in locs: + if os.path.exists(loc): + return loc + paths.append(loc) + + # As a last resort, guess if we have it in some common installation path. + common_install_path = os.path.join('/opt/rocm/lib/', lib_name) + if os.path.exists(common_install_path): + return common_install_path + paths.append(common_install_path) + + raise RuntimeError(f"cannot locate {lib_name} after attempted paths {paths}") + + +def compile_module_from_src(src, name): + key = hashlib.sha256(src.encode("utf-8")).hexdigest() + cache = get_cache_manager(key) + cache_path = cache.get_file(f"{name}.so") + if cache_path is None: + with tempfile.TemporaryDirectory() as tmpdir: + src_path = os.path.join(tmpdir, "main.c") + with open(src_path, "w") as f: + f.write(src) + so = _build(name, src_path, tmpdir, [], include_dir, []) + with open(so, "rb") as f: + cache_path = cache.put(f.read(), f"{name}.so", binary=True) + import importlib.util + spec = importlib.util.spec_from_file_location(name, cache_path) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +class HIPUtils(object): + + def __new__(cls): + if not hasattr(cls, "instance"): + cls.instance = super(HIPUtils, cls).__new__(cls) + return cls.instance + + def __init__(self): + libhip_path = _get_path_to_hip_runtime_dylib() + src = Path(os.path.join(dirname, "driver.c")).read_text() + # Just do a simple search and replace here instead of templates or format strings. + # This way we don't need to escape-quote C code curly brackets and we can replace + # exactly once. + src = src.replace('/*py_libhip_search_path*/', libhip_path, 1) + mod = compile_module_from_src(src, "hip_utils") + self.load_binary = mod.load_binary + self.get_device_properties = mod.get_device_properties + + +# -------------------- Launcher ---------------------------- +def ty_to_cpp(ty): + if ty[0] == '*': + return "hipDeviceptr_t" + return { + "i1": "int32_t", + "i8": "int8_t", + "i16": "int16_t", + "i32": "int32_t", + "i64": "int64_t", + "u1": "uint32_t", + "u8": "uint8_t", + "u16": "uint16_t", + "u32": "uint32_t", + "u64": "uint64_t", + "fp16": "float", + "bf16": "float", + "fp32": "float", + "f32": "float", + "fp64": "double", + }[ty] + + +def make_launcher(constants, signature, ids, warp_size): + start_desc = len(signature) + #signature = generate_cu_signature(constants, signature, ids) + arg_decls = ', '.join(f"{ty_to_cpp(ty)} arg{i}" for i, ty in signature.items()) + + def _extracted_type(ty): + if ty[0] == '*': + return "PyObject*" + return { + 'i1': 'int32_t', + 'i8': 'int8_t', + 'i16': 'int16_t', + 'i32': 'int32_t', + 'i64': 'int64_t', + 'u1': 'uint32_t', + 'u8': 'uint8_t', + 'u16': 'uint16_t', + 'u32': 'uint32_t', + 'u64': 'uint64_t', + 'fp16': 'float', + 'bf16': 'float', + 'fp32': 'float', + 'f32': 'float', + 'fp64': 'double', + }[ty] + + def format_of(ty): + return { + "PyObject*": "O", + "float": "f", + "double": "d", + "long": "l", + "int8_t": "b", + "int16_t": "h", + "int32_t": "i", + "int64_t": "l", + "uint8_t": "B", + "uint16_t": "H", + "uint32_t": "I", + "uint64_t": "K", + }[ty] + + args_format = ''.join([format_of(_extracted_type(ty)) for ty in signature.values()]) + format = "iiiKKOOOO" + args_format + args_list = ', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else '' + + libhip_path = _get_path_to_hip_runtime_dylib() + + # generate glue code + params = [i for i in signature.keys() if i not in constants] + src = f""" +#define __HIP_PLATFORM_AMD__ +#include +#include +#include +#include +#include + +// The list of paths to search for the HIP runtime library. The caller Python +// code should substitute the search path placeholder. +static const char *hipLibSearchPaths[] = {{"{libhip_path}"}}; + +// The list of HIP dynamic library symbols and their signature we are interested +// in this file. +#define HIP_SYMBOL_LIST(FOR_EACH_ERR_FN, FOR_EACH_STR_FN) \\ + FOR_EACH_STR_FN(hipGetErrorString, hipError_t hipError) \\ + FOR_EACH_ERR_FN(hipModuleLaunchKernel, hipFunction_t f, \\ + unsigned int gridDimX, unsigned int gridDimY, \\ + unsigned int gridDimZ, unsigned int blockDimX, \\ + unsigned int blockDimY, unsigned int blockDimZ, \\ + unsigned int sharedMemBytes, hipStream_t stream, \\ + void **kernelParams, void **extra) \\ + FOR_EACH_ERR_FN(hipPointerGetAttribute, void *data, \\ + hipPointer_attribute attribute, hipDeviceptr_t ptr) + +// The HIP symbol table for holding resolved dynamic library symbols. +struct HIPSymbolTable {{ +#define DEFINE_EACH_ERR_FIELD(hipSymbolName, ...) \\ + hipError_t (*hipSymbolName)(__VA_ARGS__); +#define DEFINE_EACH_STR_FIELD(hipSymbolName, ...) \\ + const char *(*hipSymbolName)(__VA_ARGS__); + + HIP_SYMBOL_LIST(DEFINE_EACH_ERR_FIELD, DEFINE_EACH_STR_FIELD) +}}; + +static struct HIPSymbolTable hipSymbolTable; + +bool initSymbolTable() {{ + // Use the HIP runtime library loaded into the existing process if it exits. + void *lib = dlopen("libamdhip64.so", RTLD_NOLOAD); + if (lib) {{ + // printf("[triton] chosen loaded libamdhip64.so in the process\\n"); + }} + + // Otherwise, go through the list of search paths to dlopen the first HIP + // driver library. + if (!lib) {{ + int n = sizeof(hipLibSearchPaths) / sizeof(hipLibSearchPaths[0]); + for (int i = 0; i < n; ++i) {{ + void *handle = dlopen(hipLibSearchPaths[i], RTLD_LAZY | RTLD_LOCAL); + if (handle) {{ + lib = handle; + // printf("[triton] chosen %s\\n", hipLibSearchPaths[i]); + }} + }} + }} + if (!lib) {{ + PyErr_SetString(PyExc_RuntimeError, "cannot open libamdhip64.so"); + return false; + }} + + // Resolve all symbols we are interested in. + dlerror(); // Clear existing errors + const char *error = NULL; +#define QUERY_EACH_FN(hipSymbolName, ...) \\ + *(void **)&hipSymbolTable.hipSymbolName = dlsym(lib, #hipSymbolName); \\ + error = dlerror(); \\ + if (error) {{ \\ + PyErr_SetString(PyExc_RuntimeError, \\ + "cannot query " #hipSymbolName " from libamdhip64.so"); \\ + dlclose(lib); \\ + return false; \\ + }} + + HIP_SYMBOL_LIST(QUERY_EACH_FN, QUERY_EACH_FN) + + return true; +}} + +static inline void gpuAssert(hipError_t code, const char *file, int line) +{{ + if (code != HIP_SUCCESS) + {{ + const char* prefix = "Triton Error [HIP]: "; + const char* str = hipSymbolTable.hipGetErrorString(code); + char err[1024] = {{0}}; + snprintf(err, 1024, "%s Code: %d, Messsage: %s", prefix, code, str ); + PyErr_SetString(PyExc_RuntimeError, err); + }} +}} + +#define HIP_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }} + +static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, hipStream_t stream, hipFunction_t function{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{ + // printf("_launch hip kernel\\n"); + void *params[] = {{ {', '.join(f"&arg{i}" for i in params)} }}; + if (gridX*gridY*gridZ > 0) {{ + HIP_CHECK(hipSymbolTable.hipModuleLaunchKernel(function, gridX, gridY, gridZ, {warp_size}*num_warps, 1, 1, shared_memory, stream, params, 0)); + }} + }} + +typedef struct _DevicePtrInfo {{ + hipDeviceptr_t dev_ptr; + bool valid; +}} DevicePtrInfo; + +static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{ + DevicePtrInfo ptr_info; + ptr_info.dev_ptr = 0; + ptr_info.valid = true; + if (PyLong_Check(obj)) {{ + ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(obj); + return ptr_info; + }} + if (obj == Py_None) {{ + // valid nullptr + return ptr_info; + }} + PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr"); + if(ptr){{ + PyObject *empty_tuple = PyTuple_New(0); + PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL); + Py_DECREF(empty_tuple); + Py_DECREF(ptr); + if (!PyLong_Check(ret)) {{ + PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int"); + ptr_info.valid = false; + return ptr_info; + }} + ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(ret); + if(!ptr_info.dev_ptr) + return ptr_info; + uint64_t dev_ptr; + hipError_t status = hipSymbolTable.hipPointerGetAttribute(&dev_ptr, HIP_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr); + if (status == hipErrorInvalidValue) {{ + PyErr_Format(PyExc_ValueError, + "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx); + ptr_info.valid = false; + }} + ptr_info.dev_ptr = (hipDeviceptr_t)dev_ptr; + Py_DECREF(ret); + return ptr_info; + }} + PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method"); + return ptr_info; +}} + +static PyObject* launch(PyObject* self, PyObject* args) {{ + // printf("launch\\n"); + int gridX, gridY, gridZ; + uint64_t _stream; + uint64_t _function; + PyObject *launch_enter_hook = NULL; + PyObject *launch_exit_hook = NULL; + PyObject *kernel_metadata = NULL; + PyObject *launch_metadata = NULL; + {' '.join([f"{_extracted_type(ty)} _arg{i}; " for i, ty in signature.items()])} + if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ, &_stream, &_function, + &kernel_metadata, &launch_metadata, + &launch_enter_hook, &launch_exit_hook {args_list})) {{ + return NULL; + }} + + // extract kernel metadata + int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ; + if (!PyArg_ParseTuple(kernel_metadata, \"iiiiii\", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {{ + return NULL; + }} + // extract launch metadata + if (launch_enter_hook != Py_None){{ + PyObject* args = Py_BuildValue("(O)", launch_metadata); + PyObject* ret = PyObject_CallObject(launch_enter_hook, args); + Py_DECREF(args); + if (!ret) + return NULL; + }} + + + // raise exception asap + {"; ".join([f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;" if ty[0] == "*" else "" for i, ty in signature.items()])}; + _launch(gridX, gridY, gridZ, num_warps, num_ctas, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (hipStream_t)_stream, (hipFunction_t)_function{', ' + ', '.join(f"ptr_info{i}.dev_ptr" if ty[0]=="*" else f"_arg{i}"for i, ty in signature.items()) if len(signature) > 0 else ''}); + + if(launch_exit_hook != Py_None){{ + PyObject* args = Py_BuildValue("(O)", launch_metadata); + PyObject* ret = PyObject_CallObject(launch_exit_hook, args); + Py_DECREF(args); + if (!ret) + return NULL; + }} + + if(PyErr_Occurred()) {{ + return NULL; + }} + // return None + Py_INCREF(Py_None); + return Py_None; +}} + +static PyMethodDef ModuleMethods[] = {{ + {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}}, + {{NULL, NULL, 0, NULL}} // sentinel +}}; + +static struct PyModuleDef ModuleDef = {{ + PyModuleDef_HEAD_INIT, + \"__triton_launcher\", + NULL, //documentation + -1, //size + ModuleMethods +}}; + +PyMODINIT_FUNC PyInit___triton_launcher(void) {{ + if (!initSymbolTable()) {{ + return NULL; + }} + PyObject *m = PyModule_Create(&ModuleDef); + if(m == NULL) {{ + return NULL; + }} + PyModule_AddFunctions(m, ModuleMethods); + return m; +}} +""" + return src + + +class HIPLauncher(object): + + def __init__(self, src, metadata): + ids = {"ids_of_const_exprs": src.fn.constexprs if hasattr(src, "fn") else tuple()} + constants = src.constants if hasattr(src, "constants") else dict() + cst_key = lambda i: src.fn.arg_names.index(i) if isinstance(i, str) else i + constants = {cst_key(key): value for key, value in constants.items()} + signature = {cst_key(key): value for key, value in src.signature.items()} + src = make_launcher(constants, signature, ids, metadata.warp_size) + mod = compile_module_from_src(src, "__triton_launcher") + self.launch = mod.launch + + def __call__(self, *args, **kwargs): + self.launch(*args, **kwargs) + + +class HIPDriver(GPUDriver): + + def __init__(self): + super().__init__() + self.utils = HIPUtils() + self.launcher_cls = HIPLauncher + + @staticmethod + def is_active(): + import torch + return torch.version.hip is not None + + def get_current_target(self): + device = self.get_current_device() + device_properties = self.utils.get_device_properties(device) + arch = device_properties['arch'] + warp_size = device_properties['warpSize'] + return GPUTarget("hip", arch.split(':')[0], warp_size) diff --git a/.venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/hip_runtime.h b/.venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/hip_runtime.h new file mode 100644 index 0000000000000000000000000000000000000000..20dd9703df4e1027ab30bbe12b320aa3178ea3eb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/hip_runtime.h @@ -0,0 +1,75 @@ +/* +Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +//! HIP = Heterogeneous-compute Interface for Portability +//! +//! Define a extremely thin runtime layer that allows source code to be compiled unmodified +//! through either AMD CLANG or NVCC. Key features tend to be in the spirit +//! and terminology of CUDA, but with a portable path to other accelerators as well: +// +//! Both paths support rich C++ features including classes, templates, lambdas, etc. +//! Runtime API is C +//! Memory management is based on pure pointers and resembles malloc/free/copy. +// +//! hip_runtime.h : includes everything in hip_api.h, plus math builtins and kernel launch +//! macros. hip_runtime_api.h : Defines HIP API. This is a C header file and does not use any C++ +//! features. + +#ifndef HIP_INCLUDE_HIP_HIP_RUNTIME_H +#define HIP_INCLUDE_HIP_HIP_RUNTIME_H + +#if __HIP_DEVICE_COMPILE__ && !__GFX7__ && !__GFX8__ && !__GFX9__ && __AMDGCN_WAVEFRONT_SIZE == 64 +#error HIP is not supported on the specified GPU ARCH with wavefront size 64 +#endif + +#if !defined(__HIPCC_RTC__) +// Some standard header files, these are included by hc.hpp and so want to make them avail on both +// paths to provide a consistent include env and avoid "missing symbol" errors that only appears +// on NVCC path: +#include +#include +#include +#include + +#if __cplusplus > 199711L +#include +#endif +#endif // !defined(__HIPCC_RTC__) + +#include +#include + +#if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) +#include +#elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__) +#include +#else +#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); +#endif + +#if !defined(__HIPCC_RTC__) +#include +#include +#endif // !defined(__HIPCC_RTC__) +#include + +#endif diff --git a/.venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/texture_types.h b/.venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/texture_types.h new file mode 100644 index 0000000000000000000000000000000000000000..3473c8188637995720d20e614e85d06d8bdf7d59 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/texture_types.h @@ -0,0 +1,194 @@ +/* +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_TEXTURE_TYPES_H +#define HIP_INCLUDE_HIP_TEXTURE_TYPES_H + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreserved-identifier" +#pragma clang diagnostic ignored "-Wreserved-macro-identifier" +#pragma clang diagnostic ignored "-Wc++98-compat" +#endif + +#if !defined(__HIPCC_RTC__) +#include +#endif + +#if !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__) +#include "texture_types.h" +#elif defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) +/******************************************************************************* + * * + * * + * * + *******************************************************************************/ +#if !defined(__HIPCC_RTC__) +#include +#include +#include +#endif // !defined(__HIPCC_RTC__) + +#define hipTextureType1D 0x01 +#define hipTextureType2D 0x02 +#define hipTextureType3D 0x03 +#define hipTextureTypeCubemap 0x0C +#define hipTextureType1DLayered 0xF1 +#define hipTextureType2DLayered 0xF2 +#define hipTextureTypeCubemapLayered 0xFC + +/** + * Should be same as HSA_IMAGE_OBJECT_SIZE_DWORD/HSA_SAMPLER_OBJECT_SIZE_DWORD + */ +#define HIP_IMAGE_OBJECT_SIZE_DWORD 12 +#define HIP_SAMPLER_OBJECT_SIZE_DWORD 8 +#define HIP_SAMPLER_OBJECT_OFFSET_DWORD HIP_IMAGE_OBJECT_SIZE_DWORD +#define HIP_TEXTURE_OBJECT_SIZE_DWORD (HIP_IMAGE_OBJECT_SIZE_DWORD + HIP_SAMPLER_OBJECT_SIZE_DWORD) + +/** + * An opaque value that represents a hip texture object + */ +struct __hip_texture; +typedef struct __hip_texture* hipTextureObject_t; + +/** + * hip texture address modes + */ +enum hipTextureAddressMode { + hipAddressModeWrap = 0, + hipAddressModeClamp = 1, + hipAddressModeMirror = 2, + hipAddressModeBorder = 3 +}; + +/** + * hip texture filter modes + */ +enum hipTextureFilterMode { hipFilterModePoint = 0, hipFilterModeLinear = 1 }; + +/** + * hip texture read modes + */ +enum hipTextureReadMode { hipReadModeElementType = 0, hipReadModeNormalizedFloat = 1 }; + +/** + * hip texture reference + */ +typedef struct textureReference { + int normalized; + enum hipTextureReadMode readMode;// used only for driver API's + enum hipTextureFilterMode filterMode; + enum hipTextureAddressMode addressMode[3]; // Texture address mode for up to 3 dimensions + struct hipChannelFormatDesc channelDesc; + int sRGB; // Perform sRGB->linear conversion during texture read + unsigned int maxAnisotropy; // Limit to the anisotropy ratio + enum hipTextureFilterMode mipmapFilterMode; + float mipmapLevelBias; + float minMipmapLevelClamp; + float maxMipmapLevelClamp; + + hipTextureObject_t textureObject; + int numChannels; + enum hipArray_Format format; +}textureReference; + +/** + * hip texture descriptor + */ +typedef struct hipTextureDesc { + enum hipTextureAddressMode addressMode[3]; // Texture address mode for up to 3 dimensions + enum hipTextureFilterMode filterMode; + enum hipTextureReadMode readMode; + int sRGB; // Perform sRGB->linear conversion during texture read + float borderColor[4]; + int normalizedCoords; + unsigned int maxAnisotropy; + enum hipTextureFilterMode mipmapFilterMode; + float mipmapLevelBias; + float minMipmapLevelClamp; + float maxMipmapLevelClamp; +}hipTextureDesc; + +#if __cplusplus + +/******************************************************************************* + * * + * * + * * + *******************************************************************************/ +#if __HIP__ +#define __HIP_TEXTURE_ATTRIB __attribute__((device_builtin_texture_type)) +#else +#define __HIP_TEXTURE_ATTRIB +#endif + +typedef textureReference* hipTexRef; + +template +struct __HIP_TEXTURE_ATTRIB texture : public textureReference { + texture(int norm = 0, enum hipTextureFilterMode fMode = hipFilterModePoint, + enum hipTextureAddressMode aMode = hipAddressModeClamp) { + normalized = norm; + readMode = mode; + filterMode = fMode; + addressMode[0] = aMode; + addressMode[1] = aMode; + addressMode[2] = aMode; + channelDesc = hipCreateChannelDesc(); + sRGB = 0; + textureObject = nullptr; + maxAnisotropy = 0; + mipmapLevelBias = 0; + minMipmapLevelClamp = 0; + maxMipmapLevelClamp = 0; + } + + texture(int norm, enum hipTextureFilterMode fMode, enum hipTextureAddressMode aMode, + struct hipChannelFormatDesc desc) { + normalized = norm; + readMode = mode; + filterMode = fMode; + addressMode[0] = aMode; + addressMode[1] = aMode; + addressMode[2] = aMode; + channelDesc = desc; + sRGB = 0; + textureObject = nullptr; + maxAnisotropy = 0; + mipmapLevelBias = 0; + minMipmapLevelClamp = 0; + maxMipmapLevelClamp = 0; + } +}; + +#endif /* __cplusplus */ + +#else +#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); +#endif + +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +#endif diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b694de5b421ecc7f647562b68258a0743dfab943 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/compiler.cpython-311.pyc b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/compiler.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..217f8ae237a5e5cd6e954b395d7c3461a6e34529 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/compiler.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/driver.cpython-311.pyc b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/driver.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c823afef7031ed45d44b498c813633387ed5cad5 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/driver.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openacc/cupti_openacc.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openacc/cupti_openacc.h new file mode 100644 index 0000000000000000000000000000000000000000..b7ea50da7beb2187e77f7606dd70faed0e4b4add --- /dev/null +++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openacc/cupti_openacc.h @@ -0,0 +1,98 @@ +/* + * Copyright 2017 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#include + +#if !defined(_CUPTI_OPENACC_H_) +#define _CUPTI_OPENACC_H_ + +#ifndef CUPTIAPI +#ifdef _WIN32 +#define CUPTIAPI __stdcall +#else +#define CUPTIAPI +#endif +#endif + +#if defined(__LP64__) +#define CUPTILP64 1 +#elif defined(_WIN64) +#define CUPTILP64 1 +#else +#undef CUPTILP64 +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility push(default) +#endif + +/** + * \brief Initialize OpenACC support + * + * \param profRegister function of type acc_prof_reg as obtained from acc_register_library + * \param profUnregister function of type acc_prof_reg as obtained from acc_register_library + * \param profLookup function of type acc_prof_lookup as obtained from acc_register_library + */ +CUptiResult CUPTIAPI +cuptiOpenACCInitialize(void *profRegister, void *profUnregister, void *profLookup); + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility pop +#endif + +#if defined(__cplusplus) +} +#endif + +#endif /*_CUPTI_OPENACC_H_*/ + diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/cupti_openmp.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/cupti_openmp.h new file mode 100644 index 0000000000000000000000000000000000000000..303dd42878fb02774d872c197ccc27b17f2af69e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/cupti_openmp.h @@ -0,0 +1,100 @@ +/* + * Copyright 2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#include +#include "Openmp/omp-tools.h" + +#if !defined(_CUPTI_OPENMP_H_) +#define _CUPTI_OPENMP_H_ + +#ifndef CUPTIAPI +#ifdef _WIN32 +#define CUPTIAPI __stdcall +#else +#define CUPTIAPI +#endif +#endif + +#if defined(__LP64__) +#define CUPTILP64 1 +#elif defined(_WIN64) +#define CUPTILP64 1 +#else +#undef CUPTILP64 +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility push(default) +#endif + +/** + * \brief Initialize OPENMP support (deprecated, used before OpenMP 5.0) + * + */ +int CUPTIAPI cuptiOpenMpInitialize(ompt_function_lookup_t ompt_fn_lookup, const char *runtime_version, unsigned int ompt_version); + +/** + * \brief Initialize OPENMP support + * + */ +int CUPTIAPI cuptiOpenMpInitialize_v2(ompt_function_lookup_t lookup, int initial_device_num, ompt_data_t *tool_data); + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility pop +#endif + +#if defined(__cplusplus) +} +#endif + +#endif /*_CUPTI_OPENMP_H_*/ diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/omp-tools.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/omp-tools.h new file mode 100644 index 0000000000000000000000000000000000000000..276967d07e8f8c0f7686e5b3b15151edf2415ae7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/omp-tools.h @@ -0,0 +1,1083 @@ +/* + * include/50/omp-tools.h.var + */ + +//===----------------------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// + +#ifndef __OMPT__ +#define __OMPT__ + +/***************************************************************************** + * system include files + *****************************************************************************/ + +#include +#include + +/***************************************************************************** + * iteration macros + *****************************************************************************/ + +#define FOREACH_OMPT_INQUIRY_FN(macro) \ + macro (ompt_enumerate_states) \ + macro (ompt_enumerate_mutex_impls) \ + \ + macro (ompt_set_callback) \ + macro (ompt_get_callback) \ + \ + macro (ompt_get_state) \ + \ + macro (ompt_get_parallel_info) \ + macro (ompt_get_task_info) \ + macro (ompt_get_task_memory) \ + macro (ompt_get_thread_data) \ + macro (ompt_get_unique_id) \ + macro (ompt_finalize_tool) \ + \ + macro(ompt_get_num_procs) \ + macro(ompt_get_num_places) \ + macro(ompt_get_place_proc_ids) \ + macro(ompt_get_place_num) \ + macro(ompt_get_partition_place_nums) \ + macro(ompt_get_proc_id) \ + \ + macro(ompt_get_target_info) \ + macro(ompt_get_num_devices) + +#define FOREACH_OMPT_STATE(macro) \ + \ + /* first available state */ \ + macro (ompt_state_undefined, 0x102) /* undefined thread state */ \ + \ + /* work states (0..15) */ \ + macro (ompt_state_work_serial, 0x000) /* working outside parallel */ \ + macro (ompt_state_work_parallel, 0x001) /* working within parallel */ \ + macro (ompt_state_work_reduction, 0x002) /* performing a reduction */ \ + \ + /* barrier wait states (16..31) */ \ + macro (ompt_state_wait_barrier, 0x010) /* waiting at a barrier */ \ + macro (ompt_state_wait_barrier_implicit_parallel, 0x011) \ + /* implicit barrier at the end of parallel region */\ + macro (ompt_state_wait_barrier_implicit_workshare, 0x012) \ + /* implicit barrier at the end of worksharing */ \ + macro (ompt_state_wait_barrier_implicit, 0x013) /* implicit barrier */ \ + macro (ompt_state_wait_barrier_explicit, 0x014) /* explicit barrier */ \ + \ + /* task wait states (32..63) */ \ + macro (ompt_state_wait_taskwait, 0x020) /* waiting at a taskwait */ \ + macro (ompt_state_wait_taskgroup, 0x021) /* waiting at a taskgroup */ \ + \ + /* mutex wait states (64..127) */ \ + macro (ompt_state_wait_mutex, 0x040) \ + macro (ompt_state_wait_lock, 0x041) /* waiting for lock */ \ + macro (ompt_state_wait_critical, 0x042) /* waiting for critical */ \ + macro (ompt_state_wait_atomic, 0x043) /* waiting for atomic */ \ + macro (ompt_state_wait_ordered, 0x044) /* waiting for ordered */ \ + \ + /* target wait states (128..255) */ \ + macro (ompt_state_wait_target, 0x080) /* waiting for target region */ \ + macro (ompt_state_wait_target_map, 0x081) /* waiting for target data mapping operation */ \ + macro (ompt_state_wait_target_update, 0x082) /* waiting for target update operation */ \ + \ + /* misc (256..511) */ \ + macro (ompt_state_idle, 0x100) /* waiting for work */ \ + macro (ompt_state_overhead, 0x101) /* overhead excluding wait states */ \ + \ + /* implementation-specific states (512..) */ + + +#define FOREACH_KMP_MUTEX_IMPL(macro) \ + macro (kmp_mutex_impl_none, 0) /* unknown implementation */ \ + macro (kmp_mutex_impl_spin, 1) /* based on spin */ \ + macro (kmp_mutex_impl_queuing, 2) /* based on some fair policy */ \ + macro (kmp_mutex_impl_speculative, 3) /* based on HW-supported speculation */ + +#define FOREACH_OMPT_EVENT(macro) \ + \ + /*--- Mandatory Events ---*/ \ + macro (ompt_callback_thread_begin, ompt_callback_thread_begin_t, 1) /* thread begin */ \ + macro (ompt_callback_thread_end, ompt_callback_thread_end_t, 2) /* thread end */ \ + \ + macro (ompt_callback_parallel_begin, ompt_callback_parallel_begin_t, 3) /* parallel begin */ \ + macro (ompt_callback_parallel_end, ompt_callback_parallel_end_t, 4) /* parallel end */ \ + \ + macro (ompt_callback_task_create, ompt_callback_task_create_t, 5) /* task begin */ \ + macro (ompt_callback_task_schedule, ompt_callback_task_schedule_t, 6) /* task schedule */ \ + macro (ompt_callback_implicit_task, ompt_callback_implicit_task_t, 7) /* implicit task */ \ + \ + macro (ompt_callback_target, ompt_callback_target_t, 8) /* target */ \ + macro (ompt_callback_target_data_op, ompt_callback_target_data_op_t, 9) /* target data op */ \ + macro (ompt_callback_target_submit, ompt_callback_target_submit_t, 10) /* target submit */ \ + \ + macro (ompt_callback_control_tool, ompt_callback_control_tool_t, 11) /* control tool */ \ + \ + macro (ompt_callback_device_initialize, ompt_callback_device_initialize_t, 12) /* device initialize */ \ + macro (ompt_callback_device_finalize, ompt_callback_device_finalize_t, 13) /* device finalize */ \ + \ + macro (ompt_callback_device_load, ompt_callback_device_load_t, 14) /* device load */ \ + macro (ompt_callback_device_unload, ompt_callback_device_unload_t, 15) /* device unload */ \ + \ + /* Optional Events */ \ + macro (ompt_callback_sync_region_wait, ompt_callback_sync_region_t, 16) /* sync region wait begin or end */ \ + \ + macro (ompt_callback_mutex_released, ompt_callback_mutex_t, 17) /* mutex released */ \ + \ + macro (ompt_callback_dependences, ompt_callback_dependences_t, 18) /* report task dependences */ \ + macro (ompt_callback_task_dependence, ompt_callback_task_dependence_t, 19) /* report task dependence */ \ + \ + macro (ompt_callback_work, ompt_callback_work_t, 20) /* task at work begin or end */ \ + \ + macro (ompt_callback_master, ompt_callback_master_t, 21) /* task at master begin or end */ \ + \ + macro (ompt_callback_target_map, ompt_callback_target_map_t, 22) /* target map */ \ + \ + macro (ompt_callback_sync_region, ompt_callback_sync_region_t, 23) /* sync region begin or end */ \ + \ + macro (ompt_callback_lock_init, ompt_callback_mutex_acquire_t, 24) /* lock init */ \ + macro (ompt_callback_lock_destroy, ompt_callback_mutex_t, 25) /* lock destroy */ \ + \ + macro (ompt_callback_mutex_acquire, ompt_callback_mutex_acquire_t, 26) /* mutex acquire */ \ + macro (ompt_callback_mutex_acquired, ompt_callback_mutex_t, 27) /* mutex acquired */ \ + \ + macro (ompt_callback_nest_lock, ompt_callback_nest_lock_t, 28) /* nest lock */ \ + \ + macro (ompt_callback_flush, ompt_callback_flush_t, 29) /* after executing flush */ \ + \ + macro (ompt_callback_cancel, ompt_callback_cancel_t, 30) /* cancel innermost binding region */ \ + \ + macro (ompt_callback_reduction, ompt_callback_sync_region_t, 31) /* reduction */ \ + \ + macro (ompt_callback_dispatch, ompt_callback_dispatch_t, 32) /* dispatch of work */ + +/***************************************************************************** + * implementation specific types + *****************************************************************************/ + +typedef enum kmp_mutex_impl_t { +#define kmp_mutex_impl_macro(impl, code) impl = code, + FOREACH_KMP_MUTEX_IMPL(kmp_mutex_impl_macro) +#undef kmp_mutex_impl_macro +} kmp_mutex_impl_t; + +/***************************************************************************** + * definitions generated from spec + *****************************************************************************/ + +typedef enum ompt_callbacks_t { + ompt_callback_thread_begin = 1, + ompt_callback_thread_end = 2, + ompt_callback_parallel_begin = 3, + ompt_callback_parallel_end = 4, + ompt_callback_task_create = 5, + ompt_callback_task_schedule = 6, + ompt_callback_implicit_task = 7, + ompt_callback_target = 8, + ompt_callback_target_data_op = 9, + ompt_callback_target_submit = 10, + ompt_callback_control_tool = 11, + ompt_callback_device_initialize = 12, + ompt_callback_device_finalize = 13, + ompt_callback_device_load = 14, + ompt_callback_device_unload = 15, + ompt_callback_sync_region_wait = 16, + ompt_callback_mutex_released = 17, + ompt_callback_dependences = 18, + ompt_callback_task_dependence = 19, + ompt_callback_work = 20, + ompt_callback_master = 21, + ompt_callback_target_map = 22, + ompt_callback_sync_region = 23, + ompt_callback_lock_init = 24, + ompt_callback_lock_destroy = 25, + ompt_callback_mutex_acquire = 26, + ompt_callback_mutex_acquired = 27, + ompt_callback_nest_lock = 28, + ompt_callback_flush = 29, + ompt_callback_cancel = 30, + ompt_callback_reduction = 31, + ompt_callback_dispatch = 32 +} ompt_callbacks_t; + +typedef enum ompt_record_t { + ompt_record_ompt = 1, + ompt_record_native = 2, + ompt_record_invalid = 3 +} ompt_record_t; + +typedef enum ompt_record_native_t { + ompt_record_native_info = 1, + ompt_record_native_event = 2 +} ompt_record_native_t; + +typedef enum ompt_set_result_t { + ompt_set_error = 0, + ompt_set_never = 1, + ompt_set_impossible = 2, + ompt_set_sometimes = 3, + ompt_set_sometimes_paired = 4, + ompt_set_always = 5 +} ompt_set_result_t; + +typedef uint64_t ompt_id_t; + +typedef uint64_t ompt_device_time_t; + +typedef uint64_t ompt_buffer_cursor_t; + +typedef enum ompt_thread_t { + ompt_thread_initial = 1, + ompt_thread_worker = 2, + ompt_thread_other = 3, + ompt_thread_unknown = 4 +} ompt_thread_t; + +typedef enum ompt_scope_endpoint_t { + ompt_scope_begin = 1, + ompt_scope_end = 2 +} ompt_scope_endpoint_t; + +typedef enum ompt_dispatch_t { + ompt_dispatch_iteration = 1, + ompt_dispatch_section = 2 +} ompt_dispatch_t; + +typedef enum ompt_sync_region_t { + ompt_sync_region_barrier = 1, + ompt_sync_region_barrier_implicit = 2, + ompt_sync_region_barrier_explicit = 3, + ompt_sync_region_barrier_implementation = 4, + ompt_sync_region_taskwait = 5, + ompt_sync_region_taskgroup = 6, + ompt_sync_region_reduction = 7 +} ompt_sync_region_t; + +typedef enum ompt_target_data_op_t { + ompt_target_data_alloc = 1, + ompt_target_data_transfer_to_device = 2, + ompt_target_data_transfer_from_device = 3, + ompt_target_data_delete = 4, + ompt_target_data_associate = 5, + ompt_target_data_disassociate = 6 +} ompt_target_data_op_t; + +typedef enum ompt_work_t { + ompt_work_loop = 1, + ompt_work_sections = 2, + ompt_work_single_executor = 3, + ompt_work_single_other = 4, + ompt_work_workshare = 5, + ompt_work_distribute = 6, + ompt_work_taskloop = 7 +} ompt_work_t; + +typedef enum ompt_mutex_t { + ompt_mutex_lock = 1, + ompt_mutex_test_lock = 2, + ompt_mutex_nest_lock = 3, + ompt_mutex_test_nest_lock = 4, + ompt_mutex_critical = 5, + ompt_mutex_atomic = 6, + ompt_mutex_ordered = 7 +} ompt_mutex_t; + +typedef enum ompt_native_mon_flag_t { + ompt_native_data_motion_explicit = 0x01, + ompt_native_data_motion_implicit = 0x02, + ompt_native_kernel_invocation = 0x04, + ompt_native_kernel_execution = 0x08, + ompt_native_driver = 0x10, + ompt_native_runtime = 0x20, + ompt_native_overhead = 0x40, + ompt_native_idleness = 0x80 +} ompt_native_mon_flag_t; + +typedef enum ompt_task_flag_t { + ompt_task_initial = 0x00000001, + ompt_task_implicit = 0x00000002, + ompt_task_explicit = 0x00000004, + ompt_task_target = 0x00000008, + ompt_task_undeferred = 0x08000000, + ompt_task_untied = 0x10000000, + ompt_task_final = 0x20000000, + ompt_task_mergeable = 0x40000000, + ompt_task_merged = 0x80000000 +} ompt_task_flag_t; + +typedef enum ompt_task_status_t { + ompt_task_complete = 1, + ompt_task_yield = 2, + ompt_task_cancel = 3, + ompt_task_detach = 4, + ompt_task_early_fulfill = 5, + ompt_task_late_fulfill = 6, + ompt_task_switch = 7 +} ompt_task_status_t; + +typedef enum ompt_target_t { + ompt_target = 1, + ompt_target_enter_data = 2, + ompt_target_exit_data = 3, + ompt_target_update = 4 +} ompt_target_t; + +typedef enum ompt_parallel_flag_t { + ompt_parallel_invoker_program = 0x00000001, + ompt_parallel_invoker_runtime = 0x00000002, + ompt_parallel_league = 0x40000000, + ompt_parallel_team = 0x80000000 +} ompt_parallel_flag_t; + +typedef enum ompt_target_map_flag_t { + ompt_target_map_flag_to = 0x01, + ompt_target_map_flag_from = 0x02, + ompt_target_map_flag_alloc = 0x04, + ompt_target_map_flag_release = 0x08, + ompt_target_map_flag_delete = 0x10, + ompt_target_map_flag_implicit = 0x20 +} ompt_target_map_flag_t; + +typedef enum ompt_dependence_type_t { + ompt_dependence_type_in = 1, + ompt_dependence_type_out = 2, + ompt_dependence_type_inout = 3, + ompt_dependence_type_mutexinoutset = 4, + ompt_dependence_type_source = 5, + ompt_dependence_type_sink = 6 +} ompt_dependence_type_t; + +typedef enum ompt_cancel_flag_t { + ompt_cancel_parallel = 0x01, + ompt_cancel_sections = 0x02, + ompt_cancel_loop = 0x04, + ompt_cancel_taskgroup = 0x08, + ompt_cancel_activated = 0x10, + ompt_cancel_detected = 0x20, + ompt_cancel_discarded_task = 0x40 +} ompt_cancel_flag_t; + +typedef uint64_t ompt_hwid_t; + +typedef uint64_t ompt_wait_id_t; + +typedef enum ompt_frame_flag_t { + ompt_frame_runtime = 0x00, + ompt_frame_application = 0x01, + ompt_frame_cfa = 0x10, + ompt_frame_framepointer = 0x20, + ompt_frame_stackaddress = 0x30 +} ompt_frame_flag_t; + +typedef enum ompt_state_t { + ompt_state_work_serial = 0x000, + ompt_state_work_parallel = 0x001, + ompt_state_work_reduction = 0x002, + + ompt_state_wait_barrier = 0x010, + ompt_state_wait_barrier_implicit_parallel = 0x011, + ompt_state_wait_barrier_implicit_workshare = 0x012, + ompt_state_wait_barrier_implicit = 0x013, + ompt_state_wait_barrier_explicit = 0x014, + + ompt_state_wait_taskwait = 0x020, + ompt_state_wait_taskgroup = 0x021, + + ompt_state_wait_mutex = 0x040, + ompt_state_wait_lock = 0x041, + ompt_state_wait_critical = 0x042, + ompt_state_wait_atomic = 0x043, + ompt_state_wait_ordered = 0x044, + + ompt_state_wait_target = 0x080, + ompt_state_wait_target_map = 0x081, + ompt_state_wait_target_update = 0x082, + + ompt_state_idle = 0x100, + ompt_state_overhead = 0x101, + ompt_state_undefined = 0x102 +} ompt_state_t; + +typedef uint64_t (*ompt_get_unique_id_t) (void); + +typedef uint64_t ompd_size_t; + +typedef uint64_t ompd_wait_id_t; + +typedef uint64_t ompd_addr_t; +typedef int64_t ompd_word_t; +typedef uint64_t ompd_seg_t; + +typedef uint64_t ompd_device_t; + +typedef uint64_t ompd_thread_id_t; + +typedef enum ompd_scope_t { + ompd_scope_global = 1, + ompd_scope_address_space = 2, + ompd_scope_thread = 3, + ompd_scope_parallel = 4, + ompd_scope_implicit_task = 5, + ompd_scope_task = 6 +} ompd_scope_t; + +typedef uint64_t ompd_icv_id_t; + +typedef enum ompd_rc_t { + ompd_rc_ok = 0, + ompd_rc_unavailable = 1, + ompd_rc_stale_handle = 2, + ompd_rc_bad_input = 3, + ompd_rc_error = 4, + ompd_rc_unsupported = 5, + ompd_rc_needs_state_tracking = 6, + ompd_rc_incompatible = 7, + ompd_rc_device_read_error = 8, + ompd_rc_device_write_error = 9, + ompd_rc_nomem = 10, +} ompd_rc_t; + +typedef void (*ompt_interface_fn_t) (void); + +typedef ompt_interface_fn_t (*ompt_function_lookup_t) ( + const char *interface_function_name +); + +typedef union ompt_data_t { + uint64_t value; + void *ptr; +} ompt_data_t; + +typedef struct ompt_frame_t { + ompt_data_t exit_frame; + ompt_data_t enter_frame; + int exit_frame_flags; + int enter_frame_flags; +} ompt_frame_t; + +typedef void (*ompt_callback_t) (void); + +typedef void ompt_device_t; + +typedef void ompt_buffer_t; + +typedef void (*ompt_callback_buffer_request_t) ( + int device_num, + ompt_buffer_t **buffer, + size_t *bytes +); + +typedef void (*ompt_callback_buffer_complete_t) ( + int device_num, + ompt_buffer_t *buffer, + size_t bytes, + ompt_buffer_cursor_t begin, + int buffer_owned +); + +typedef void (*ompt_finalize_t) ( + ompt_data_t *tool_data +); + +typedef int (*ompt_initialize_t) ( + ompt_function_lookup_t lookup, + int initial_device_num, + ompt_data_t *tool_data +); + +typedef struct ompt_start_tool_result_t { + ompt_initialize_t initialize; + ompt_finalize_t finalize; + ompt_data_t tool_data; +} ompt_start_tool_result_t; + +typedef struct ompt_record_abstract_t { + ompt_record_native_t rclass; + const char *type; + ompt_device_time_t start_time; + ompt_device_time_t end_time; + ompt_hwid_t hwid; +} ompt_record_abstract_t; + +typedef struct ompt_dependence_t { + ompt_data_t variable; + ompt_dependence_type_t dependence_type; +} ompt_dependence_t; + +typedef int (*ompt_enumerate_states_t) ( + int current_state, + int *next_state, + const char **next_state_name +); + +typedef int (*ompt_enumerate_mutex_impls_t) ( + int current_impl, + int *next_impl, + const char **next_impl_name +); + +typedef ompt_set_result_t (*ompt_set_callback_t) ( + ompt_callbacks_t event, + ompt_callback_t callback +); + +typedef int (*ompt_get_callback_t) ( + ompt_callbacks_t event, + ompt_callback_t *callback +); + +typedef ompt_data_t *(*ompt_get_thread_data_t) (void); + +typedef int (*ompt_get_num_procs_t) (void); + +typedef int (*ompt_get_num_places_t) (void); + +typedef int (*ompt_get_place_proc_ids_t) ( + int place_num, + int ids_size, + int *ids +); + +typedef int (*ompt_get_place_num_t) (void); + +typedef int (*ompt_get_partition_place_nums_t) ( + int place_nums_size, + int *place_nums +); + +typedef int (*ompt_get_proc_id_t) (void); + +typedef int (*ompt_get_state_t) ( + ompt_wait_id_t *wait_id +); + +typedef int (*ompt_get_parallel_info_t) ( + int ancestor_level, + ompt_data_t **parallel_data, + int *team_size +); + +typedef int (*ompt_get_task_info_t) ( + int ancestor_level, + int *flags, + ompt_data_t **task_data, + ompt_frame_t **task_frame, + ompt_data_t **parallel_data, + int *thread_num +); + +typedef int (*ompt_get_task_memory_t)( + void **addr, + size_t *size, + int block +); + +typedef int (*ompt_get_target_info_t) ( + uint64_t *device_num, + ompt_id_t *target_id, + ompt_id_t *host_op_id +); + +typedef int (*ompt_get_num_devices_t) (void); + +typedef void (*ompt_finalize_tool_t) (void); + +typedef int (*ompt_get_device_num_procs_t) ( + ompt_device_t *device +); + +typedef ompt_device_time_t (*ompt_get_device_time_t) ( + ompt_device_t *device +); + +typedef double (*ompt_translate_time_t) ( + ompt_device_t *device, + ompt_device_time_t time +); + +typedef ompt_set_result_t (*ompt_set_trace_ompt_t) ( + ompt_device_t *device, + unsigned int enable, + unsigned int etype +); + +typedef ompt_set_result_t (*ompt_set_trace_native_t) ( + ompt_device_t *device, + int enable, + int flags +); + +typedef int (*ompt_start_trace_t) ( + ompt_device_t *device, + ompt_callback_buffer_request_t request, + ompt_callback_buffer_complete_t complete +); + +typedef int (*ompt_pause_trace_t) ( + ompt_device_t *device, + int begin_pause +); + +typedef int (*ompt_flush_trace_t) ( + ompt_device_t *device +); + +typedef int (*ompt_stop_trace_t) ( + ompt_device_t *device +); + +typedef int (*ompt_advance_buffer_cursor_t) ( + ompt_device_t *device, + ompt_buffer_t *buffer, + size_t size, + ompt_buffer_cursor_t current, + ompt_buffer_cursor_t *next +); + +typedef ompt_record_t (*ompt_get_record_type_t) ( + ompt_buffer_t *buffer, + ompt_buffer_cursor_t current +); + +typedef void *(*ompt_get_record_native_t) ( + ompt_buffer_t *buffer, + ompt_buffer_cursor_t current, + ompt_id_t *host_op_id +); + +typedef ompt_record_abstract_t * +(*ompt_get_record_abstract_t) ( + void *native_record +); + +typedef void (*ompt_callback_thread_begin_t) ( + ompt_thread_t thread_type, + ompt_data_t *thread_data +); + +typedef struct ompt_record_thread_begin_t { + ompt_thread_t thread_type; +} ompt_record_thread_begin_t; + +typedef void (*ompt_callback_thread_end_t) ( + ompt_data_t *thread_data +); + +typedef void (*ompt_callback_parallel_begin_t) ( + ompt_data_t *encountering_task_data, + const ompt_frame_t *encountering_task_frame, + ompt_data_t *parallel_data, + unsigned int requested_parallelism, + int flags, + const void *codeptr_ra +); + +typedef struct ompt_record_parallel_begin_t { + ompt_id_t encountering_task_id; + ompt_id_t parallel_id; + unsigned int requested_parallelism; + int flags; + const void *codeptr_ra; +} ompt_record_parallel_begin_t; + +typedef void (*ompt_callback_parallel_end_t) ( + ompt_data_t *parallel_data, + ompt_data_t *encountering_task_data, + int flags, + const void *codeptr_ra +); + +typedef struct ompt_record_parallel_end_t { + ompt_id_t parallel_id; + ompt_id_t encountering_task_id; + int flags; + const void *codeptr_ra; +} ompt_record_parallel_end_t; + +typedef void (*ompt_callback_work_t) ( + ompt_work_t wstype, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + uint64_t count, + const void *codeptr_ra +); + +typedef struct ompt_record_work_t { + ompt_work_t wstype; + ompt_scope_endpoint_t endpoint; + ompt_id_t parallel_id; + ompt_id_t task_id; + uint64_t count; + const void *codeptr_ra; +} ompt_record_work_t; + +typedef void (*ompt_callback_dispatch_t) ( + ompt_data_t *parallel_data, + ompt_data_t *task_data, + ompt_dispatch_t kind, + ompt_data_t instance +); + +typedef struct ompt_record_dispatch_t { + ompt_id_t parallel_id; + ompt_id_t task_id; + ompt_dispatch_t kind; + ompt_data_t instance; +} ompt_record_dispatch_t; + +typedef void (*ompt_callback_task_create_t) ( + ompt_data_t *encountering_task_data, + const ompt_frame_t *encountering_task_frame, + ompt_data_t *new_task_data, + int flags, + int has_dependences, + const void *codeptr_ra +); + +typedef struct ompt_record_task_create_t { + ompt_id_t encountering_task_id; + ompt_id_t new_task_id; + int flags; + int has_dependences; + const void *codeptr_ra; +} ompt_record_task_create_t; + +typedef void (*ompt_callback_dependences_t) ( + ompt_data_t *task_data, + const ompt_dependence_t *deps, + int ndeps +); + +typedef struct ompt_record_dependences_t { + ompt_id_t task_id; + ompt_dependence_t dep; + int ndeps; +} ompt_record_dependences_t; + +typedef void (*ompt_callback_task_dependence_t) ( + ompt_data_t *src_task_data, + ompt_data_t *sink_task_data +); + +typedef struct ompt_record_task_dependence_t { + ompt_id_t src_task_id; + ompt_id_t sink_task_id; +} ompt_record_task_dependence_t; + +typedef void (*ompt_callback_task_schedule_t) ( + ompt_data_t *prior_task_data, + ompt_task_status_t prior_task_status, + ompt_data_t *next_task_data +); + +typedef struct ompt_record_task_schedule_t { + ompt_id_t prior_task_id; + ompt_task_status_t prior_task_status; + ompt_id_t next_task_id; +} ompt_record_task_schedule_t; + +typedef void (*ompt_callback_implicit_task_t) ( + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + unsigned int actual_parallelism, + unsigned int index, + int flags +); + +typedef struct ompt_record_implicit_task_t { + ompt_scope_endpoint_t endpoint; + ompt_id_t parallel_id; + ompt_id_t task_id; + unsigned int actual_parallelism; + unsigned int index; + int flags; +} ompt_record_implicit_task_t; + +typedef void (*ompt_callback_master_t) ( + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra +); + +typedef struct ompt_record_master_t { + ompt_scope_endpoint_t endpoint; + ompt_id_t parallel_id; + ompt_id_t task_id; + const void *codeptr_ra; +} ompt_record_master_t; + +typedef void (*ompt_callback_sync_region_t) ( + ompt_sync_region_t kind, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra +); + +typedef struct ompt_record_sync_region_t { + ompt_sync_region_t kind; + ompt_scope_endpoint_t endpoint; + ompt_id_t parallel_id; + ompt_id_t task_id; + const void *codeptr_ra; +} ompt_record_sync_region_t; + +typedef void (*ompt_callback_mutex_acquire_t) ( + ompt_mutex_t kind, + unsigned int hint, + unsigned int impl, + ompt_wait_id_t wait_id, + const void *codeptr_ra +); + +typedef struct ompt_record_mutex_acquire_t { + ompt_mutex_t kind; + unsigned int hint; + unsigned int impl; + ompt_wait_id_t wait_id; + const void *codeptr_ra; +} ompt_record_mutex_acquire_t; + +typedef void (*ompt_callback_mutex_t) ( + ompt_mutex_t kind, + ompt_wait_id_t wait_id, + const void *codeptr_ra +); + +typedef struct ompt_record_mutex_t { + ompt_mutex_t kind; + ompt_wait_id_t wait_id; + const void *codeptr_ra; +} ompt_record_mutex_t; + +typedef void (*ompt_callback_nest_lock_t) ( + ompt_scope_endpoint_t endpoint, + ompt_wait_id_t wait_id, + const void *codeptr_ra +); + +typedef struct ompt_record_nest_lock_t { + ompt_scope_endpoint_t endpoint; + ompt_wait_id_t wait_id; + const void *codeptr_ra; +} ompt_record_nest_lock_t; + +typedef void (*ompt_callback_flush_t) ( + ompt_data_t *thread_data, + const void *codeptr_ra +); + +typedef struct ompt_record_flush_t { + const void *codeptr_ra; +} ompt_record_flush_t; + +typedef void (*ompt_callback_cancel_t) ( + ompt_data_t *task_data, + int flags, + const void *codeptr_ra +); + +typedef struct ompt_record_cancel_t { + ompt_id_t task_id; + int flags; + const void *codeptr_ra; +} ompt_record_cancel_t; + +typedef void (*ompt_callback_device_initialize_t) ( + int device_num, + const char *type, + ompt_device_t *device, + ompt_function_lookup_t lookup, + const char *documentation +); + +typedef void (*ompt_callback_device_finalize_t) ( + int device_num +); + +typedef void (*ompt_callback_device_load_t) ( + int device_num, + const char *filename, + int64_t offset_in_file, + void *vma_in_file, + size_t bytes, + void *host_addr, + void *device_addr, + uint64_t module_id +); + +typedef void (*ompt_callback_device_unload_t) ( + int device_num, + uint64_t module_id +); + +typedef void (*ompt_callback_target_data_op_t) ( + ompt_id_t target_id, + ompt_id_t host_op_id, + ompt_target_data_op_t optype, + void *src_addr, + int src_device_num, + void *dest_addr, + int dest_device_num, + size_t bytes, + const void *codeptr_ra +); + +typedef struct ompt_record_target_data_op_t { + ompt_id_t host_op_id; + ompt_target_data_op_t optype; + void *src_addr; + int src_device_num; + void *dest_addr; + int dest_device_num; + size_t bytes; + ompt_device_time_t end_time; + const void *codeptr_ra; +} ompt_record_target_data_op_t; + +typedef void (*ompt_callback_target_t) ( + ompt_target_t kind, + ompt_scope_endpoint_t endpoint, + int device_num, + ompt_data_t *task_data, + ompt_id_t target_id, + const void *codeptr_ra +); + +typedef struct ompt_record_target_t { + ompt_target_t kind; + ompt_scope_endpoint_t endpoint; + int device_num; + ompt_id_t task_id; + ompt_id_t target_id; + const void *codeptr_ra; +} ompt_record_target_t; + +typedef void (*ompt_callback_target_map_t) ( + ompt_id_t target_id, + unsigned int nitems, + void **host_addr, + void **device_addr, + size_t *bytes, + unsigned int *mapping_flags, + const void *codeptr_ra +); + +typedef struct ompt_record_target_map_t { + ompt_id_t target_id; + unsigned int nitems; + void **host_addr; + void **device_addr; + size_t *bytes; + unsigned int *mapping_flags; + const void *codeptr_ra; +} ompt_record_target_map_t; + +typedef void (*ompt_callback_target_submit_t) ( + ompt_id_t target_id, + ompt_id_t host_op_id, + unsigned int requested_num_teams +); + +typedef struct ompt_record_target_kernel_t { + ompt_id_t host_op_id; + unsigned int requested_num_teams; + unsigned int granted_num_teams; + ompt_device_time_t end_time; +} ompt_record_target_kernel_t; + +typedef int (*ompt_callback_control_tool_t) ( + uint64_t command, + uint64_t modifier, + void *arg, + const void *codeptr_ra +); + +typedef struct ompt_record_control_tool_t { + uint64_t command; + uint64_t modifier; + const void *codeptr_ra; +} ompt_record_control_tool_t; + +typedef struct ompd_address_t { + ompd_seg_t segment; + ompd_addr_t address; +} ompd_address_t; + +typedef struct ompd_frame_info_t { + ompd_address_t frame_address; + ompd_word_t frame_flag; +} ompd_frame_info_t; + +typedef struct _ompd_aspace_handle ompd_address_space_handle_t; +typedef struct _ompd_thread_handle ompd_thread_handle_t; +typedef struct _ompd_parallel_handle ompd_parallel_handle_t; +typedef struct _ompd_task_handle ompd_task_handle_t; + +typedef struct _ompd_aspace_cont ompd_address_space_context_t; +typedef struct _ompd_thread_cont ompd_thread_context_t; + +typedef struct ompd_device_type_sizes_t { + uint8_t sizeof_char; + uint8_t sizeof_short; + uint8_t sizeof_int; + uint8_t sizeof_long; + uint8_t sizeof_long_long; + uint8_t sizeof_pointer; +} ompd_device_type_sizes_t; + +typedef struct ompt_record_ompt_t { + ompt_callbacks_t type; + ompt_device_time_t time; + ompt_id_t thread_id; + ompt_id_t target_id; + union { + ompt_record_thread_begin_t thread_begin; + ompt_record_parallel_begin_t parallel_begin; + ompt_record_parallel_end_t parallel_end; + ompt_record_work_t work; + ompt_record_dispatch_t dispatch; + ompt_record_task_create_t task_create; + ompt_record_dependences_t dependences; + ompt_record_task_dependence_t task_dependence; + ompt_record_task_schedule_t task_schedule; + ompt_record_implicit_task_t implicit_task; + ompt_record_master_t master; + ompt_record_sync_region_t sync_region; + ompt_record_mutex_acquire_t mutex_acquire; + ompt_record_mutex_t mutex; + ompt_record_nest_lock_t nest_lock; + ompt_record_flush_t flush; + ompt_record_cancel_t cancel; + ompt_record_target_t target; + ompt_record_target_data_op_t target_data_op; + ompt_record_target_map_t target_map; + ompt_record_target_kernel_t target_kernel; + ompt_record_control_tool_t control_tool; + } record; +} ompt_record_ompt_t; + +typedef ompt_record_ompt_t *(*ompt_get_record_ompt_t) ( + ompt_buffer_t *buffer, + ompt_buffer_cursor_t current +); + +#define ompt_id_none 0 +#define ompt_data_none {0} +#define ompt_time_none 0 +#define ompt_hwid_none 0 +#define ompt_addr_none ~0 +#define ompt_mutex_impl_none 0 +#define ompt_wait_id_none 0 + +#define ompd_segment_none 0 + +#endif /* __OMPT__ */ diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/channel_descriptor.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/channel_descriptor.h new file mode 100644 index 0000000000000000000000000000000000000000..c6f039db8effce996015f901562009ebe976d832 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/channel_descriptor.h @@ -0,0 +1,588 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CHANNEL_DESCRIPTOR_H__) +#define __CHANNEL_DESCRIPTOR_H__ + +#if defined(__cplusplus) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +/** + * \addtogroup CUDART_HIGHLEVEL + * + * @{ + */ + +/** + * \brief \hl Returns a channel descriptor using the specified format + * + * Returns a channel descriptor with format \p f and number of bits of each + * component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is + * defined as: + * \code + struct cudaChannelFormatDesc { + int x, y, z, w; + enum cudaChannelFormatKind f; + }; + * \endcode + * + * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, + * ::cudaChannelFormatKindUnsigned, cudaChannelFormatKindFloat, + * ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2, + * ::cudaChannelFormatKindSignedNormalized8X4, + * ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2, + * ::cudaChannelFormatKindUnsignedNormalized8X4, + * ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2, + * ::cudaChannelFormatKindSignedNormalized16X4, + * ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2, + * ::cudaChannelFormatKindUnsignedNormalized16X4 + * or ::cudaChannelFormatKindNV12. + * + * The format is specified by the template specialization. + * + * The template function specializes for the following scalar types: + * char, signed char, unsigned char, short, unsigned short, int, unsigned int, long, unsigned long, and float. + * The template function specializes for the following vector types: + * char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, long{1|2|4}, ulong{1|2|4}, float{1|2|4}. + * The template function specializes for following cudaChannelFormatKind enum values: + * ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4}, and ::cudaChannelFormatKindNV12. + * + * Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone + * + * \return + * Channel descriptor with format \p f + * + * \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)", + * ::cudaGetChannelDesc, + */ +template __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(char) * 8; + +#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__) + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */ + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */ +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(signed char) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned char) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(signed char) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned char) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(signed char) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned char) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(signed char) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned char) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(short) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(short) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(int) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned int) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(int) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned int) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(int) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned int) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(int) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned int) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); +} + +#if !defined(__LP64__) + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(long) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned long) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(long) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned long) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(long) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned long) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(long) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned long) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); +} + +#endif /* !__LP64__ */ + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(float) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(float) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(float) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(float) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void) +{ + int e = (int)sizeof(char) * 8; + + return cudaCreateChannelDesc(e, e, e, 0, cudaChannelFormatKindNV12); +} + +template __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone); +} + +/* Signed 8-bit normalized integer formats */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4); +} + +/* Unsigned 8-bit normalized integer formats */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4); +} + +/* Signed 16-bit normalized integer formats */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4); +} + +/* Unsigned 16-bit normalized integer formats */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4); +} + +/* NV12 format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12); +} + +/* BC1 format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1); +} + +/* BC1sRGB format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB); +} + +/* BC2 format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2); +} + +/* BC2sRGB format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB); +} + +/* BC3 format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3); +} + +/* BC3sRGB format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB); +} + +/* BC4 unsigned format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4); +} + +/* BC4 signed format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4); +} + +/* BC5 unsigned format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5); +} + +/* BC5 signed format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5); +} + +/* BC6H unsigned format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H); +} + +/* BC6H signed format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H); +} + +/* BC7 format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7); +} + +/* BC7sRGB format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB); +} + +#endif /* __cplusplus */ + +/** @} */ +/** @} */ /* END CUDART_TEXTURE_HL */ + +#endif /* !__CHANNEL_DESCRIPTOR_H__ */ diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups.h new file mode 100644 index 0000000000000000000000000000000000000000..e04314301c1cca9e2514f56367e1dbc45cfdad69 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups.h @@ -0,0 +1,1730 @@ +/* + * Copyright 1993-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _COOPERATIVE_GROUPS_H_ +#define _COOPERATIVE_GROUPS_H_ + +#if defined(__cplusplus) && defined(__CUDACC__) + +#include "cooperative_groups/details/info.h" +#include "cooperative_groups/details/driver_abi.h" +#include "cooperative_groups/details/helpers.h" +#include "cooperative_groups/details/memory.h" + +#if defined(_CG_HAS_STL_ATOMICS) +#include +#define _CG_THREAD_SCOPE(scope) _CG_STATIC_CONST_DECL cuda::thread_scope thread_scope = scope; +#else +#define _CG_THREAD_SCOPE(scope) +#endif + +_CG_BEGIN_NAMESPACE + +namespace details { + _CG_CONST_DECL unsigned int coalesced_group_id = 1; + _CG_CONST_DECL unsigned int multi_grid_group_id = 2; + _CG_CONST_DECL unsigned int grid_group_id = 3; + _CG_CONST_DECL unsigned int thread_block_id = 4; + _CG_CONST_DECL unsigned int multi_tile_group_id = 5; + _CG_CONST_DECL unsigned int cluster_group_id = 6; +} + +/** + * class thread_group; + * + * Generic thread group type, into which all groups are convertible. + * It acts as a container for all storage necessary for the derived groups, + * and will dispatch the API calls to the correct derived group. This means + * that all derived groups must implement the same interface as thread_group. + */ +class thread_group +{ +protected: + struct group_data { + unsigned int _unused : 1; + unsigned int type : 7, : 0; + }; + + struct gg_data { + details::grid_workspace *gridWs; + }; + +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + struct mg_data { + unsigned long long _unused : 1; + unsigned long long type : 7; + unsigned long long handle : 56; + const details::multi_grid::multi_grid_functions *functions; + }; +#endif + + struct tg_data { + unsigned int is_tiled : 1; + unsigned int type : 7; + unsigned int size : 24; + // packed to 4b + unsigned int metaGroupSize : 16; + unsigned int metaGroupRank : 16; + // packed to 8b + unsigned int mask; + // packed to 12b + unsigned int _res; + }; + + friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz); + friend class thread_block; + + union __align__(8) { + group_data group; + tg_data coalesced; + gg_data grid; +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + mg_data multi_grid; +#endif + } _data; + + _CG_QUALIFIER thread_group operator=(const thread_group& src); + + _CG_QUALIFIER thread_group(unsigned int type) { + _data.group.type = type; + _data.group._unused = false; + } + +#ifdef _CG_CPP11_FEATURES + static_assert(sizeof(tg_data) <= 16, "Failed size check"); + static_assert(sizeof(gg_data) <= 16, "Failed size check"); +# ifdef _CG_ABI_EXPERIMENTAL + static_assert(sizeof(mg_data) <= 16, "Failed size check"); +# endif +#endif + +public: + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device) + + _CG_QUALIFIER unsigned long long size() const; + _CG_QUALIFIER unsigned long long num_threads() const; + _CG_QUALIFIER unsigned long long thread_rank() const; + _CG_QUALIFIER void sync() const; + _CG_QUALIFIER unsigned int get_type() const { + return _data.group.type; + } + +}; + +template +struct thread_group_base : public thread_group { + _CG_QUALIFIER thread_group_base() : thread_group(TyId) {} + _CG_STATIC_CONST_DECL unsigned int id = TyId; +}; + +#if defined(_CG_HAS_MULTI_GRID_GROUP) + +/** + * class multi_grid_group; + * + * Threads within this this group are guaranteed to be co-resident on the + * same system, on multiple devices within the same launched kernels. + * To use this group, the kernel must have been launched with + * cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent), + * and the device must support it (queryable device attribute). + * + * Constructed via this_multi_grid(); + */ + + +# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) +class multi_grid_group; + +// Multi grid group requires these functions to be templated to prevent ptxas from trying to use CG syscalls +template +__device__ _CG_DEPRECATED multi_grid_group this_multi_grid(); + +class multi_grid_group : public thread_group_base +{ +private: + template + _CG_QUALIFIER multi_grid_group() { + _data.multi_grid.functions = details::multi_grid::load_grid_intrinsics(); + _data.multi_grid.handle = _data.multi_grid.functions->get_intrinsic_handle(); + } + + friend multi_grid_group this_multi_grid(); + +public: + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system) + + _CG_QUALIFIER bool is_valid() const { + return (_data.multi_grid.handle != 0); + } + + _CG_QUALIFIER void sync() const { + if (!is_valid()) { + _CG_ABORT(); + } + _data.multi_grid.functions->sync(_data.multi_grid.handle); + } + + _CG_QUALIFIER unsigned long long num_threads() const { + _CG_ASSERT(is_valid()); + return _data.multi_grid.functions->size(_data.multi_grid.handle); + } + + _CG_QUALIFIER unsigned long long size() const { + return num_threads(); + } + + _CG_QUALIFIER unsigned long long thread_rank() const { + _CG_ASSERT(is_valid()); + return _data.multi_grid.functions->thread_rank(_data.multi_grid.handle); + } + + _CG_QUALIFIER unsigned int grid_rank() const { + _CG_ASSERT(is_valid()); + return (_data.multi_grid.functions->grid_rank(_data.multi_grid.handle)); + } + + _CG_QUALIFIER unsigned int num_grids() const { + _CG_ASSERT(is_valid()); + return (_data.multi_grid.functions->num_grids(_data.multi_grid.handle)); + } +}; +# else +class multi_grid_group +{ +private: + unsigned long long _handle; + unsigned int _size; + unsigned int _rank; + + friend _CG_QUALIFIER multi_grid_group this_multi_grid(); + + _CG_QUALIFIER multi_grid_group() { + _handle = details::multi_grid::get_intrinsic_handle(); + _size = details::multi_grid::size(_handle); + _rank = details::multi_grid::thread_rank(_handle); + } + +public: + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system) + + _CG_QUALIFIER _CG_DEPRECATED bool is_valid() const { + return (_handle != 0); + } + + _CG_QUALIFIER _CG_DEPRECATED void sync() const { + if (!is_valid()) { + _CG_ABORT(); + } + details::multi_grid::sync(_handle); + } + + _CG_QUALIFIER _CG_DEPRECATED unsigned long long num_threads() const { + _CG_ASSERT(is_valid()); + return _size; + } + + _CG_QUALIFIER _CG_DEPRECATED unsigned long long size() const { + return num_threads(); + } + + _CG_QUALIFIER _CG_DEPRECATED unsigned long long thread_rank() const { + _CG_ASSERT(is_valid()); + return _rank; + } + + _CG_QUALIFIER _CG_DEPRECATED unsigned int grid_rank() const { + _CG_ASSERT(is_valid()); + return (details::multi_grid::grid_rank(_handle)); + } + + _CG_QUALIFIER _CG_DEPRECATED unsigned int num_grids() const { + _CG_ASSERT(is_valid()); + return (details::multi_grid::num_grids(_handle)); + } +}; +# endif + +/** + * multi_grid_group this_multi_grid() + * + * Constructs a multi_grid_group + */ +# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) +template +__device__ +#else +_CG_QUALIFIER +# endif +_CG_DEPRECATED +multi_grid_group this_multi_grid() +{ + return multi_grid_group(); +} +#endif + +/** + * class grid_group; + * + * Threads within this this group are guaranteed to be co-resident on the + * same device within the same launched kernel. To use this group, the kernel + * must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent), + * and the device must support it (queryable device attribute). + * + * Constructed via this_grid(); + */ +class grid_group : public thread_group_base +{ + _CG_STATIC_CONST_DECL unsigned int _group_id = details::grid_group_id; + friend _CG_QUALIFIER grid_group this_grid(); + +private: + _CG_QUALIFIER grid_group(details::grid_workspace *gridWs) { + _data.grid.gridWs = gridWs; + } + + public: + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device) + + _CG_QUALIFIER bool is_valid() const { + return (_data.grid.gridWs != NULL); + } + + _CG_QUALIFIER void sync() const { + if (!is_valid()) { + _CG_ABORT(); + } + details::grid::sync(&_data.grid.gridWs->barrier); + } + +#if defined(_CG_CPP11_FEATURES) + using arrival_token = unsigned int; + + _CG_QUALIFIER arrival_token barrier_arrive() const { + if (!is_valid()) { + _CG_ABORT(); + } + return details::grid::barrier_arrive(&_data.grid.gridWs->barrier); + } + + _CG_QUALIFIER void barrier_wait(arrival_token&& token) const { + details::grid::barrier_wait(token, &_data.grid.gridWs->barrier); + } +#endif + + _CG_STATIC_QUALIFIER unsigned long long size() { + return details::grid::size(); + } + + _CG_STATIC_QUALIFIER dim3 group_dim() { + return details::grid::grid_dim(); + } + + _CG_STATIC_QUALIFIER dim3 dim_threads() { + return details::grid::dim_threads(); + } + + _CG_STATIC_QUALIFIER unsigned long long num_threads() { + return details::grid::num_threads(); + } + + _CG_STATIC_QUALIFIER dim3 thread_index() { + return details::grid::thread_index(); + } + + _CG_STATIC_QUALIFIER unsigned long long thread_rank() { + return details::grid::thread_rank(); + } + + _CG_STATIC_QUALIFIER dim3 dim_blocks() { + return details::grid::dim_blocks(); + } + + _CG_STATIC_QUALIFIER unsigned long long num_blocks() { + return details::grid::num_blocks(); + } + + _CG_STATIC_QUALIFIER dim3 block_index() { + return details::grid::block_index(); + } + + _CG_STATIC_QUALIFIER unsigned long long block_rank() { + return details::grid::block_rank(); + } + +# if defined(_CG_HAS_CLUSTER_GROUP) + _CG_STATIC_QUALIFIER dim3 dim_clusters() { + return details::grid::dim_clusters(); + } + + _CG_STATIC_QUALIFIER unsigned long long num_clusters() { + return details::grid::num_clusters(); + } + + _CG_STATIC_QUALIFIER dim3 cluster_index() { + return details::grid::cluster_index(); + } + + _CG_STATIC_QUALIFIER unsigned long long cluster_rank() { + return details::grid::cluster_rank(); + } +# endif +}; + +_CG_QUALIFIER grid_group this_grid() { + // Load a workspace from the driver + grid_group gg(details::get_grid_workspace()); +#ifdef _CG_DEBUG + // *all* threads must be available to synchronize + gg.sync(); +#endif // _CG_DEBUG + return gg; +} + +#if defined(_CG_HAS_CLUSTER_GROUP) +/** + * class cluster_group + * + * Every GPU kernel is executed by a grid of thread blocks. A grid can be evenly + * divided along all dimensions to form groups of blocks, each group of which is + * a block cluster. Clustered grids are subject to various restrictions and + * limitations. Primarily, a cluster consists of at most 8 blocks by default + * (although the user is allowed to opt-in to non-standard sizes,) and clustered + * grids are subject to additional occupancy limitations due to per-cluster + * hardware resource consumption. In exchange, a block cluster is guaranteed to + * be a cooperative group, with access to all cooperative group capabilities, as + * well as cluster specific capabilities and accelerations. A cluster_group + * represents a block cluster. + * + * Constructed via this_cluster_group(); + */ +class cluster_group : public thread_group_base +{ + // Friends + friend _CG_QUALIFIER cluster_group this_cluster(); + + // Disable constructor + _CG_QUALIFIER cluster_group() + { + } + + public: + //_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_cluster) + + using arrival_token = struct {}; + + // Functionality exposed by the group + _CG_STATIC_QUALIFIER void sync() + { + return details::cluster::sync(); + } + + _CG_STATIC_QUALIFIER arrival_token barrier_arrive() + { + details::cluster::barrier_arrive(); + return arrival_token(); + } + + _CG_STATIC_QUALIFIER void barrier_wait() + { + return details::cluster::barrier_wait(); + } + + _CG_STATIC_QUALIFIER void barrier_wait(arrival_token&&) + { + return details::cluster::barrier_wait(); + } + + _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr) + { + return details::cluster::query_shared_rank(addr); + } + + template + _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank) + { + return details::cluster::map_shared_rank(addr, rank); + } + + _CG_STATIC_QUALIFIER dim3 block_index() + { + return details::cluster::block_index(); + } + + _CG_STATIC_QUALIFIER unsigned int block_rank() + { + return details::cluster::block_rank(); + } + + _CG_STATIC_QUALIFIER dim3 thread_index() + { + return details::cluster::thread_index(); + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank() + { + return details::cluster::thread_rank(); + } + + _CG_STATIC_QUALIFIER dim3 dim_blocks() + { + return details::cluster::dim_blocks(); + } + + _CG_STATIC_QUALIFIER unsigned int num_blocks() + { + return details::cluster::num_blocks(); + } + + _CG_STATIC_QUALIFIER dim3 dim_threads() + { + return details::cluster::dim_threads(); + } + + _CG_STATIC_QUALIFIER unsigned int num_threads() + { + return details::cluster::num_threads(); + } + + // Legacy aliases + _CG_STATIC_QUALIFIER unsigned int size() + { + return num_threads(); + } +}; + +/* + * cluster_group this_cluster() + * + * Constructs a cluster_group + */ +_CG_QUALIFIER cluster_group this_cluster() +{ + cluster_group cg; +#ifdef _CG_DEBUG + cg.sync(); +#endif + return cg; +} +#endif + +#if defined(_CG_CPP11_FEATURES) +class thread_block; +template +_CG_QUALIFIER thread_block this_thread_block(block_tile_memory& scratch); +#endif + +/** + * class thread_block + * + * Every GPU kernel is executed by a grid of thread blocks, and threads within + * each block are guaranteed to reside on the same streaming multiprocessor. + * A thread_block represents a thread block whose dimensions are not known until runtime. + * + * Constructed via this_thread_block(); + */ +class thread_block : public thread_group_base +{ + // Friends + friend _CG_QUALIFIER thread_block this_thread_block(); + friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz); + friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz); + +#if defined(_CG_CPP11_FEATURES) + template + friend _CG_QUALIFIER thread_block this_thread_block(block_tile_memory& scratch); + template + friend class __static_size_multi_warp_tile_base; + + details::multi_warp_scratch* const tile_memory; + + template + _CG_QUALIFIER thread_block(block_tile_memory& scratch) : + tile_memory(details::get_scratch_ptr(&scratch)) { +#ifdef _CG_DEBUG + if (num_threads() > MaxBlockSize) { + details::abort(); + } +#endif +#if !defined(_CG_HAS_RESERVED_SHARED) + tile_memory->init_barriers(thread_rank()); + sync(); +#endif + } +#endif + + // Disable constructor + _CG_QUALIFIER thread_block() +#if defined(_CG_CPP11_FEATURES) + : tile_memory(details::get_scratch_ptr(NULL)) +#endif + { } + + // Internal Use + _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const { + const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0); + + // Invalid, immediately fail + if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) { + details::abort(); + return (thread_block()); + } + + unsigned int mask; + unsigned int base_offset = thread_rank() & (~(tilesz - 1)); + unsigned int masklength = min((unsigned int)size() - base_offset, tilesz); + + mask = (unsigned int)(-1) >> (32 - masklength); + mask <<= (details::laneid() & ~(tilesz - 1)); + thread_group tile = thread_group(details::coalesced_group_id); + tile._data.coalesced.mask = mask; + tile._data.coalesced.size = __popc(mask); + tile._data.coalesced.metaGroupSize = (details::cta::size() + tilesz - 1) / tilesz; + tile._data.coalesced.metaGroupRank = details::cta::thread_rank() / tilesz; + tile._data.coalesced.is_tiled = true; + return (tile); + } + + public: + _CG_STATIC_CONST_DECL unsigned int _group_id = details::thread_block_id; + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block) + + _CG_STATIC_QUALIFIER void sync() { + details::cta::sync(); + } + +#if defined(_CG_CPP11_FEATURES) + struct arrival_token {}; + + _CG_QUALIFIER arrival_token barrier_arrive() const { + return arrival_token(); + } + + _CG_QUALIFIER void barrier_wait(arrival_token&&) const { + details::cta::sync(); + } +#endif + + _CG_STATIC_QUALIFIER unsigned int size() { + return details::cta::size(); + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank() { + return details::cta::thread_rank(); + } + + // Additional functionality exposed by the group + _CG_STATIC_QUALIFIER dim3 group_index() { + return details::cta::group_index(); + } + + _CG_STATIC_QUALIFIER dim3 thread_index() { + return details::cta::thread_index(); + } + + _CG_STATIC_QUALIFIER dim3 group_dim() { + return details::cta::block_dim(); + } + + _CG_STATIC_QUALIFIER dim3 dim_threads() { + return details::cta::dim_threads(); + } + + _CG_STATIC_QUALIFIER unsigned int num_threads() { + return details::cta::num_threads(); + } + +}; + +/** + * thread_block this_thread_block() + * + * Constructs a thread_block group + */ +_CG_QUALIFIER thread_block this_thread_block() +{ + return (thread_block()); +} + +#if defined(_CG_CPP11_FEATURES) +template +_CG_QUALIFIER thread_block this_thread_block(block_tile_memory& scratch) { + return (thread_block(scratch)); +} +#endif + +/** + * class coalesced_group + * + * A group representing the current set of converged threads in a warp. + * The size of the group is not guaranteed and it may return a group of + * only one thread (itself). + * + * This group exposes warp-synchronous builtins. + * Constructed via coalesced_threads(); + */ +class coalesced_group : public thread_group_base +{ +private: + friend _CG_QUALIFIER coalesced_group coalesced_threads(); + friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz); + friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz); + friend class details::_coalesced_group_data_access; + + _CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const { + unsigned int member_pack = 0; + unsigned int member_rank = 0; + for (int bit_idx = 0; bit_idx < 32; bit_idx++) { + unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx); + if (lane_bit) { + if (laneMask & lane_bit) + member_pack |= 1 << member_rank; + member_rank++; + } + } + return (member_pack); + } + + // Internal Use + _CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const { + const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0); + + // Invalid, immediately fail + if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) { + details::abort(); + return (coalesced_group(0)); + } + if (size() <= tilesz) { + return (*this); + } + + if ((_data.coalesced.is_tiled == true) && pow2_tilesz) { + unsigned int base_offset = (thread_rank() & (~(tilesz - 1))); + unsigned int masklength = min((unsigned int)size() - base_offset, tilesz); + unsigned int mask = (unsigned int)(-1) >> (32 - masklength); + + mask <<= (details::laneid() & ~(tilesz - 1)); + coalesced_group coalesced_tile = coalesced_group(mask); + coalesced_tile._data.coalesced.metaGroupSize = size() / tilesz; + coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz; + coalesced_tile._data.coalesced.is_tiled = true; + return (coalesced_tile); + } + else if ((_data.coalesced.is_tiled == false) && pow2_tilesz) { + unsigned int mask = 0; + unsigned int member_rank = 0; + int seen_lanes = (thread_rank() / tilesz) * tilesz; + for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) { + unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx); + if (lane_bit) { + if (seen_lanes <= 0 && member_rank < tilesz) { + mask |= lane_bit; + member_rank++; + } + seen_lanes--; + } + } + coalesced_group coalesced_tile = coalesced_group(mask); + // Override parent with the size of this group + coalesced_tile._data.coalesced.metaGroupSize = (size() + tilesz - 1) / tilesz; + coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz; + return coalesced_tile; + } + else { + // None in _CG_VERSION 1000 + details::abort(); + } + + return (coalesced_group(0)); + } + + protected: + _CG_QUALIFIER coalesced_group(unsigned int mask) { + _data.coalesced.mask = mask; + _data.coalesced.size = __popc(mask); + _data.coalesced.metaGroupRank = 0; + _data.coalesced.metaGroupSize = 1; + _data.coalesced.is_tiled = false; + } + + _CG_QUALIFIER unsigned int get_mask() const { + return (_data.coalesced.mask); + } + + public: + _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id; + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block) + + _CG_QUALIFIER unsigned int num_threads() const { + return _data.coalesced.size; + } + + _CG_QUALIFIER unsigned int size() const { + return num_threads(); + } + + _CG_QUALIFIER unsigned int thread_rank() const { + return (__popc(_data.coalesced.mask & details::lanemask32_lt())); + } + + // Rank of this group in the upper level of the hierarchy + _CG_QUALIFIER unsigned int meta_group_rank() const { + return _data.coalesced.metaGroupRank; + } + + // Total num partitions created out of all CTAs when the group was created + _CG_QUALIFIER unsigned int meta_group_size() const { + return _data.coalesced.metaGroupSize; + } + + _CG_QUALIFIER void sync() const { + __syncwarp(_data.coalesced.mask); + } + +#ifdef _CG_CPP11_FEATURES + template > + _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const { + unsigned int lane = (srcRank == 0) ? __ffs(_data.coalesced.mask) - 1 : + (size() == 32) ? srcRank : __fns(_data.coalesced.mask, 0, (srcRank + 1)); + + return details::tile::shuffle_dispatch::shfl( + _CG_STL_NAMESPACE::forward(elem), _data.coalesced.mask, lane, 32); + } + + template > + _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const { + if (size() == 32) { + return details::tile::shuffle_dispatch::shfl_down( + _CG_STL_NAMESPACE::forward(elem), 0xFFFFFFFF, delta, 32); + } + + unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1); + + if (lane >= 32) + lane = details::laneid(); + + return details::tile::shuffle_dispatch::shfl( + _CG_STL_NAMESPACE::forward(elem), _data.coalesced.mask, lane, 32); + } + + template > + _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, int delta) const { + if (size() == 32) { + return details::tile::shuffle_dispatch::shfl_up( + _CG_STL_NAMESPACE::forward(elem), 0xFFFFFFFF, delta, 32); + } + + unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1)); + if (lane >= 32) + lane = details::laneid(); + + return details::tile::shuffle_dispatch::shfl( + _CG_STL_NAMESPACE::forward(elem), _data.coalesced.mask, lane, 32); + } +#else + template + _CG_QUALIFIER TyIntegral shfl(TyIntegral var, unsigned int src_rank) const { + details::assert_if_not_arithmetic(); + unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 : + (size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1)); + return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); + } + + template + _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, int delta) const { + details::assert_if_not_arithmetic(); + if (size() == 32) { + return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32)); + } + unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1)); + if (lane >= 32) lane = details::laneid(); + return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); + } + + template + _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, int delta) const { + details::assert_if_not_arithmetic(); + if (size() == 32) { + return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32)); + } + unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1); + if (lane >= 32) lane = details::laneid(); + return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); + } +#endif + + _CG_QUALIFIER int any(int predicate) const { + return (__ballot_sync(_data.coalesced.mask, predicate) != 0); + } + _CG_QUALIFIER int all(int predicate) const { + return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask); + } + _CG_QUALIFIER unsigned int ballot(int predicate) const { + if (size() == 32) { + return (__ballot_sync(0xFFFFFFFF, predicate)); + } + unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate); + return (_packLanes(lane_ballot)); + } + +#ifdef _CG_HAS_MATCH_COLLECTIVE + + template + _CG_QUALIFIER unsigned int match_any(TyIntegral val) const { + details::assert_if_not_arithmetic(); + if (size() == 32) { + return (__match_any_sync(0xFFFFFFFF, val)); + } + unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val); + return (_packLanes(lane_match)); + } + + template + _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const { + details::assert_if_not_arithmetic(); + if (size() == 32) { + return (__match_all_sync(0xFFFFFFFF, val, &pred)); + } + unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred); + return (_packLanes(lane_match)); + } + +#endif /* !_CG_HAS_MATCH_COLLECTIVE */ + +}; + +_CG_QUALIFIER coalesced_group coalesced_threads() +{ + return (coalesced_group(__activemask())); +} + +namespace details { + template struct verify_thread_block_tile_size; + template <> struct verify_thread_block_tile_size<32> { typedef void OK; }; + template <> struct verify_thread_block_tile_size<16> { typedef void OK; }; + template <> struct verify_thread_block_tile_size<8> { typedef void OK; }; + template <> struct verify_thread_block_tile_size<4> { typedef void OK; }; + template <> struct verify_thread_block_tile_size<2> { typedef void OK; }; + template <> struct verify_thread_block_tile_size<1> { typedef void OK; }; + +#ifdef _CG_CPP11_FEATURES + template + using _is_power_of_2 = _CG_STL_NAMESPACE::integral_constant; + + template + using _is_single_warp = _CG_STL_NAMESPACE::integral_constant; + template + using _is_multi_warp = + _CG_STL_NAMESPACE::integral_constant 32) && (Size <= 1024)>; + + template + using _is_valid_single_warp_tile = + _CG_STL_NAMESPACE::integral_constant::value && _is_single_warp::value>; + template + using _is_valid_multi_warp_tile = + _CG_STL_NAMESPACE::integral_constant::value && _is_multi_warp::value>; +#else + template + struct _is_multi_warp { + static const bool value = false; + }; +#endif +} + +template +class __static_size_tile_base +{ +protected: + _CG_STATIC_CONST_DECL unsigned int numThreads = Size; + +public: + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block) + + // Rank of thread within tile + _CG_STATIC_QUALIFIER unsigned int thread_rank() { + return (details::cta::thread_rank() & (numThreads - 1)); + } + + // Number of threads within tile + _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int num_threads() { + return numThreads; + } + + _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int size() { + return num_threads(); + } +}; + +template +class __static_size_thread_block_tile_base : public __static_size_tile_base +{ + friend class details::_coalesced_group_data_access; + typedef details::tile::tile_helpers th; + +#ifdef _CG_CPP11_FEATURES + static_assert(details::_is_valid_single_warp_tile::value, "Size must be one of 1/2/4/8/16/32"); +#else + typedef typename details::verify_thread_block_tile_size::OK valid; +#endif + using __static_size_tile_base::numThreads; + _CG_STATIC_CONST_DECL unsigned int fullMask = 0xFFFFFFFF; + + protected: + _CG_STATIC_QUALIFIER unsigned int build_mask() { + unsigned int mask = fullMask; + if (numThreads != 32) { + // [0,31] representing the current active thread in the warp + unsigned int laneId = details::laneid(); + // shift mask according to the partition it belongs to + mask = th::tileMask << (laneId & ~(th::laneMask)); + } + return (mask); + } + +public: + _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id; + + _CG_STATIC_QUALIFIER void sync() { + __syncwarp(build_mask()); + } + +#ifdef _CG_CPP11_FEATURES + // PTX supported collectives + template > + _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const { + return details::tile::shuffle_dispatch::shfl( + _CG_STL_NAMESPACE::forward(elem), build_mask(), srcRank, numThreads); + } + + template > + _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const { + return details::tile::shuffle_dispatch::shfl_down( + _CG_STL_NAMESPACE::forward(elem), build_mask(), delta, numThreads); + } + + template > + _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int delta) const { + return details::tile::shuffle_dispatch::shfl_up( + _CG_STL_NAMESPACE::forward(elem), build_mask(), delta, numThreads); + } + + template > + _CG_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int laneMask) const { + return details::tile::shuffle_dispatch::shfl_xor( + _CG_STL_NAMESPACE::forward(elem), build_mask(), laneMask, numThreads); + } +#else + template + _CG_QUALIFIER TyIntegral shfl(TyIntegral var, int srcRank) const { + details::assert_if_not_arithmetic(); + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + + template + _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, unsigned int delta) const { + details::assert_if_not_arithmetic(); + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + + template + _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, unsigned int delta) const { + details::assert_if_not_arithmetic(); + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + + template + _CG_QUALIFIER TyIntegral shfl_xor(TyIntegral var, unsigned int laneMask) const { + details::assert_if_not_arithmetic(); + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } +#endif //_CG_CPP11_FEATURES + + _CG_QUALIFIER int any(int predicate) const { + unsigned int lane_ballot = __ballot_sync(build_mask(), predicate); + return (lane_ballot != 0); + } + _CG_QUALIFIER int all(int predicate) const { + unsigned int lane_ballot = __ballot_sync(build_mask(), predicate); + return (lane_ballot == build_mask()); + } + _CG_QUALIFIER unsigned int ballot(int predicate) const { + unsigned int lane_ballot = __ballot_sync(build_mask(), predicate); + return (lane_ballot >> (details::laneid() & (~(th::laneMask)))); + } + +#ifdef _CG_HAS_MATCH_COLLECTIVE + template + _CG_QUALIFIER unsigned int match_any(TyIntegral val) const { + details::assert_if_not_arithmetic(); + unsigned int lane_match = __match_any_sync(build_mask(), val); + return (lane_match >> (details::laneid() & (~(th::laneMask)))); + } + + template + _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const { + details::assert_if_not_arithmetic(); + unsigned int lane_match = __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (details::laneid() & (~(th::laneMask)))); + } +#endif + +}; + +template +class __static_parent_thread_block_tile_base +{ +public: + // Rank of this group in the upper level of the hierarchy + _CG_STATIC_QUALIFIER unsigned int meta_group_rank() { + return ParentT::thread_rank() / Size; + } + + // Total num partitions created out of all CTAs when the group was created + _CG_STATIC_QUALIFIER unsigned int meta_group_size() { + return (ParentT::size() + Size - 1) / Size; + } +}; + +/** + * class thread_block_tile + * + * Statically-sized group type, representing one tile of a thread block. + * The only specializations currently supported are those with native + * hardware support (1/2/4/8/16/32) + * + * This group exposes warp-synchronous builtins. + * Can only be constructed via tiled_partition(ParentT&) + */ + +template +class __single_warp_thread_block_tile : + public __static_size_thread_block_tile_base, + public __static_parent_thread_block_tile_base +{ + typedef __static_parent_thread_block_tile_base staticParentBaseT; + friend class details::_coalesced_group_data_access; + +protected: + _CG_QUALIFIER __single_warp_thread_block_tile() { }; + _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int, unsigned int) { }; + + _CG_STATIC_QUALIFIER unsigned int get_mask() { + return __static_size_thread_block_tile_base::build_mask(); + } +}; + +template +class __single_warp_thread_block_tile : + public __static_size_thread_block_tile_base, + public thread_group_base +{ + _CG_STATIC_CONST_DECL unsigned int numThreads = Size; + + template friend class __single_warp_thread_block_tile; + friend class details::_coalesced_group_data_access; + + typedef __static_size_thread_block_tile_base staticSizeBaseT; + +protected: + _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int meta_group_rank = 0, unsigned int meta_group_size = 1) { + _data.coalesced.mask = staticSizeBaseT::build_mask(); + _data.coalesced.size = numThreads; + _data.coalesced.metaGroupRank = meta_group_rank; + _data.coalesced.metaGroupSize = meta_group_size; + _data.coalesced.is_tiled = true; + } + + _CG_QUALIFIER unsigned int get_mask() const { + return (_data.coalesced.mask); + } + +public: + using staticSizeBaseT::sync; + using staticSizeBaseT::size; + using staticSizeBaseT::num_threads; + using staticSizeBaseT::thread_rank; + + _CG_QUALIFIER unsigned int meta_group_rank() const { + return _data.coalesced.metaGroupRank; + } + + _CG_QUALIFIER unsigned int meta_group_size() const { + return _data.coalesced.metaGroupSize; + } +}; + +/** + * Outer level API calls + * void sync(GroupT) - see .sync() + * void thread_rank(GroupT) - see .thread_rank() + * void group_size(GroupT) - see .size() + */ +template +_CG_QUALIFIER void sync(GroupT const &g) +{ + g.sync(); +} + +// TODO: Use a static dispatch to determine appropriate return type +// C++03 is stuck with unsigned long long for now +#ifdef _CG_CPP11_FEATURES +template +_CG_QUALIFIER auto thread_rank(GroupT const& g) -> decltype(g.thread_rank()) { + return g.thread_rank(); +} + + +template +_CG_QUALIFIER auto group_size(GroupT const &g) -> decltype(g.num_threads()) { + return g.num_threads(); +} +#else +template +_CG_QUALIFIER unsigned long long thread_rank(GroupT const& g) { + return static_cast(g.thread_rank()); +} + + +template +_CG_QUALIFIER unsigned long long group_size(GroupT const &g) { + return static_cast(g.num_threads()); +} +#endif + + +/** + * tiled_partition + * + * The tiled_partition(parent, tilesz) method is a collective operation that + * partitions the parent group into a one-dimensional, row-major, tiling of subgroups. + * + * A total of ((size(parent)+tilesz-1)/tilesz) subgroups will + * be created where threads having identical k = (thread_rank(parent)/tilesz) + * will be members of the same subgroup. + * + * The implementation may cause the calling thread to wait until all the members + * of the parent group have invoked the operation before resuming execution. + * + * Functionality is limited to power-of-two sized subgorup instances of at most + * 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be + * tiled_partition() in _CG_VERSION 1000. + */ +_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz) +{ + if (parent.get_type() == details::coalesced_group_id) { + const coalesced_group *_cg = static_cast(&parent); + return _cg->_get_tiled_threads(tilesz); + } + else { + const thread_block *_tb = static_cast(&parent); + return _tb->_get_tiled_threads(tilesz); + } +} + +// Thread block type overload: returns a basic thread_group for now (may be specialized later) +_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz) +{ + return (parent._get_tiled_threads(tilesz)); +} + +// Coalesced group type overload: retains its ability to stay coalesced +_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz) +{ + return (parent._get_tiled_threads(tilesz)); +} + +namespace details { + template + class internal_thread_block_tile : public __single_warp_thread_block_tile {}; + + template + _CG_QUALIFIER internal_thread_block_tile tiled_partition_internal() { + return internal_thread_block_tile(); + } + + template + _CG_QUALIFIER TyVal multi_warp_collectives_helper( + const GroupT& group, + WarpLambda warp_lambda, + InterWarpLambda inter_warp_lambda) { + return group.template collectives_scheme(warp_lambda, inter_warp_lambda); + } + + template + _CG_QUALIFIER T* multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id) { + return group.template get_scratch_location(warp_id); + } + + template + _CG_QUALIFIER details::barrier_t* multi_warp_sync_location_getter(const GroupT& group) { + return group.get_sync_location(); + } + +} +/** + * tiled_partition + * + * The tiled_partition(parent) method is a collective operation that + * partitions the parent group into a one-dimensional, row-major, tiling of subgroups. + * + * A total of ((size(parent)/tilesz) subgroups will be created, + * therefore the parent group size must be evenly divisible by the tilesz. + * The allow parent groups are thread_block or thread_block_tile. + * + * The implementation may cause the calling thread to wait until all the members + * of the parent group have invoked the operation before resuming execution. + * + * Functionality is limited to native hardware sizes, 1/2/4/8/16/32. + * The size(parent) must be greater than the template Size parameter + * otherwise the results are undefined. + */ + +#if defined(_CG_CPP11_FEATURES) +template +class __static_size_multi_warp_tile_base : public __static_size_tile_base +{ + static_assert(details::_is_valid_multi_warp_tile::value, "Size must be one of 64/128/256/512"); + + template + friend __device__ TyVal details::multi_warp_collectives_helper( + const GroupT& group, + WarpLambda warp_lambda, + InterWarpLambda inter_warp_lambda); + template + friend __device__ T* details::multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id); + template + friend __device__ details::barrier_t* details::multi_warp_sync_location_getter(const GroupT& group); + template + friend class __static_size_multi_warp_tile_base; + using WarpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base>; + using ThisType = __static_size_multi_warp_tile_base; + _CG_STATIC_CONST_DECL int numWarps = Size / 32; + +protected: + details::multi_warp_scratch* const tile_memory; + + template + _CG_QUALIFIER __static_size_multi_warp_tile_base(const GroupT& g) : tile_memory(g.tile_memory) { +#if defined(_CG_HAS_RESERVED_SHARED) + details::sync_warps_reset(get_sync_location(), details::cta::thread_rank()); + g.sync(); +#endif + } + + +private: + _CG_QUALIFIER details::barrier_t* get_sync_location() const { + // Different group sizes use different barriers, all groups of a given size share one barrier. + unsigned int sync_id = details::log2(Size / 64); + return &tile_memory->barriers[sync_id]; + } + + template + _CG_QUALIFIER T* get_scratch_location(unsigned int warp_id) const { + unsigned int scratch_id = (details::cta::thread_rank() - thread_rank()) / 32 + warp_id; + return reinterpret_cast(&tile_memory->communication_memory[scratch_id]); + } + + template + _CG_QUALIFIER T* get_scratch_location() const { + unsigned int scratch_id = details::cta::thread_rank() / 32; + return reinterpret_cast(&tile_memory->communication_memory[scratch_id]); + } + + template + _CG_QUALIFIER TyVal shfl_impl(TyVal val, unsigned int src) const { + unsigned int src_warp = src / 32; + auto warp = details::tiled_partition_internal<32, ThisType>(); + details::barrier_t* sync_location = get_sync_location(); + + // Get warp slot of the source threads warp. + TyVal* warp_scratch_location = get_scratch_location(src_warp); + + if (warp.meta_group_rank() == src_warp) { + warp.sync(); + // Put shuffled value into my warp slot and let my warp arrive at the barrier. + if (thread_rank() == src) { + *warp_scratch_location = val; + } + details::sync_warps_arrive(sync_location, details::cta::thread_rank(), numWarps); + TyVal result = *warp_scratch_location; + details::sync_warps_wait(sync_location, details::cta::thread_rank()); + return result; + } + else { + // Wait for the source warp to arrive on the barrier. + details::sync_warps_wait_for_specific_warp(sync_location, + (details::cta::thread_rank() / 32 - warp.meta_group_rank() + src_warp)); + TyVal result = *warp_scratch_location; + details::sync_warps(sync_location, details::cta::thread_rank(), numWarps); + return result; + } + } + + template + _CG_QUALIFIER TyVal collectives_scheme(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const { + static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size, + "Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes"); + auto warp = details::tiled_partition_internal<32, ThisType>(); + details::barrier_t* sync_location = get_sync_location(); + TyVal* warp_scratch_location = get_scratch_location(); + + warp_lambda(warp, warp_scratch_location); + + if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), numWarps)) { + auto subwarp = details::tiled_partition_internal(); + if (subwarp.meta_group_rank() == 0) { + TyVal* thread_scratch_location = get_scratch_location(subwarp.thread_rank()); + inter_warp_lambda(subwarp, thread_scratch_location); + } + warp.sync(); + details::sync_warps_release(sync_location, warp.thread_rank() == 0, details::cta::thread_rank(), numWarps); + } + TyVal result = *warp_scratch_location; + return result; + } + +public: + _CG_STATIC_CONST_DECL unsigned int _group_id = details::multi_tile_group_id; + + using __static_size_tile_base::thread_rank; + + template + _CG_QUALIFIER TyVal shfl(TyVal val, unsigned int src) const { + static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size, + "Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes"); + return shfl_impl(val, src); + } + + _CG_QUALIFIER void sync() const { + details::sync_warps(get_sync_location(), details::cta::thread_rank(), numWarps); + } + + _CG_QUALIFIER int any(int predicate) const { + auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) { + *warp_scratch_location = __any_sync(0xFFFFFFFF, predicate); + }; + auto inter_warp_lambda = + [] (details::internal_thread_block_tile& subwarp, int* thread_scratch_location) { + *thread_scratch_location = __any_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location); + }; + return collectives_scheme(warp_lambda, inter_warp_lambda); + } + + _CG_QUALIFIER int all(int predicate) const { + auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) { + *warp_scratch_location = __all_sync(0xFFFFFFFF, predicate); + }; + auto inter_warp_lambda = + [] (details::internal_thread_block_tile& subwarp, int* thread_scratch_location) { + *thread_scratch_location = __all_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location); + }; + return collectives_scheme(warp_lambda, inter_warp_lambda); + } +}; + + +template +class __multi_warp_thread_block_tile : + public __static_size_multi_warp_tile_base, + public __static_parent_thread_block_tile_base +{ + typedef __static_parent_thread_block_tile_base staticParentBaseT; + typedef __static_size_multi_warp_tile_base staticTileBaseT; +protected: + _CG_QUALIFIER __multi_warp_thread_block_tile(const ParentT& g) : + __static_size_multi_warp_tile_base(g) {} +}; + +template +class __multi_warp_thread_block_tile : public __static_size_multi_warp_tile_base +{ + const unsigned int metaGroupRank; + const unsigned int metaGroupSize; + +protected: + template + _CG_QUALIFIER __multi_warp_thread_block_tile(const __multi_warp_thread_block_tile& g) : + __static_size_multi_warp_tile_base(g), metaGroupRank(g.meta_group_rank()), metaGroupSize(g.meta_group_size()) {} + +public: + _CG_QUALIFIER unsigned int meta_group_rank() const { + return metaGroupRank; + } + + _CG_QUALIFIER unsigned int meta_group_size() const { + return metaGroupSize; + } +}; +#endif + +template +class thread_block_tile; + +namespace details { + template + class thread_block_tile_impl; + + template + class thread_block_tile_impl: public __single_warp_thread_block_tile + { + protected: + template + _CG_QUALIFIER thread_block_tile_impl(const thread_block_tile_impl& g) : + __single_warp_thread_block_tile(g.meta_group_rank(), g.meta_group_size()) {} + + _CG_QUALIFIER thread_block_tile_impl(const thread_block& g) : + __single_warp_thread_block_tile() {} + }; + +#if defined(_CG_CPP11_FEATURES) + template + class thread_block_tile_impl : public __multi_warp_thread_block_tile + { + protected: + template + _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) : + __multi_warp_thread_block_tile(g) {} + }; +#else + template + class thread_block_tile_impl + { + protected: + template + _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) {} + }; +#endif +} + +template +class thread_block_tile : public details::thread_block_tile_impl::value> +{ + friend _CG_QUALIFIER thread_block_tile<1, void> this_thread(); + +protected: + _CG_QUALIFIER thread_block_tile(const ParentT& g) : + details::thread_block_tile_impl::value>(g) {} + +public: + _CG_QUALIFIER operator thread_block_tile() const { + return thread_block_tile(*this); + } +}; + +template +class thread_block_tile : public details::thread_block_tile_impl::value> +{ + template + friend class thread_block_tile; + +protected: + template + _CG_QUALIFIER thread_block_tile(const thread_block_tile& g) : + details::thread_block_tile_impl::value>(g) {} + +public: + template + _CG_QUALIFIER thread_block_tile(const thread_block_tile& g) : + details::thread_block_tile_impl::value>(g) {} +}; + +namespace details { + template + struct tiled_partition_impl; + + template + struct tiled_partition_impl : public thread_block_tile { + _CG_QUALIFIER tiled_partition_impl(const thread_block& g) : + thread_block_tile(g) {} + }; + + // ParentT = static thread_block_tile specialization + template + struct tiled_partition_impl > : + public thread_block_tile > { +#ifdef _CG_CPP11_FEATURES + static_assert(Size < ParentSize, "Tile size bigger or equal to the parent group size"); +#endif + _CG_QUALIFIER tiled_partition_impl(const thread_block_tile& g) : + thread_block_tile >(g) {} + }; + +} + +template +_CG_QUALIFIER thread_block_tile tiled_partition(const ParentT& g) +{ + return details::tiled_partition_impl(g); +} + +/** + * thread_group this_thread() + * + * Constructs a generic thread_group containing only the calling thread + */ +_CG_QUALIFIER thread_block_tile<1, void> this_thread() +{ + // Make thread_block_tile<1, thread_block> parent of the returned group, so it will have its + // meta group rank and size set to 0 and 1 respectively. + return thread_block_tile<1, thread_block_tile<1, thread_block> >(this_thread_block()); +} + +/** + * .sync() + * + * Executes a barrier across the group + * + * Implements both a compiler fence and an architectural fence to prevent, + * memory reordering around the barrier. + */ +_CG_QUALIFIER void thread_group::sync() const +{ + switch (_data.group.type) { + case details::coalesced_group_id: + cooperative_groups::sync(*static_cast(this)); + break; + case details::thread_block_id: + cooperative_groups::sync(*static_cast(this)); + break; + case details::grid_group_id: + cooperative_groups::sync(*static_cast(this)); + break; +#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + case details::multi_grid_group_id: + cooperative_groups::sync(*static_cast(this)); + break; +#endif +#if defined(_CG_HAS_CLUSTER_GROUP) + case details::cluster_group_id: + cooperative_groups::sync(*static_cast(this)); + break; +#endif + default: + break; + } +} + +/** + * .size() + * + * Returns the total number of threads in the group. + */ +_CG_QUALIFIER unsigned long long thread_group::size() const +{ + unsigned long long size = 0; + switch (_data.group.type) { + case details::coalesced_group_id: + size = cooperative_groups::group_size(*static_cast(this)); + break; + case details::thread_block_id: + size = cooperative_groups::group_size(*static_cast(this)); + break; + case details::grid_group_id: + size = cooperative_groups::group_size(*static_cast(this)); + break; +#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + case details::multi_grid_group_id: + size = cooperative_groups::group_size(*static_cast(this)); + break; +#endif +#if defined(_CG_HAS_CLUSTER_GROUP) + case details::cluster_group_id: + size = cooperative_groups::group_size(*static_cast(this)); + break; +#endif + default: + break; + } + return size; +} + +/** + * .thread_rank() + * + * Returns the linearized rank of the calling thread along the interval [0, size()). + */ +_CG_QUALIFIER unsigned long long thread_group::thread_rank() const +{ + unsigned long long rank = 0; + switch (_data.group.type) { + case details::coalesced_group_id: + rank = cooperative_groups::thread_rank(*static_cast(this)); + break; + case details::thread_block_id: + rank = cooperative_groups::thread_rank(*static_cast(this)); + break; + case details::grid_group_id: + rank = cooperative_groups::thread_rank(*static_cast(this)); + break; +#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + case details::multi_grid_group_id: + rank = cooperative_groups::thread_rank(*static_cast(this)); + break; +#endif +#if defined(_CG_HAS_CLUSTER_GROUP) + case details::cluster_group_id: + rank = cooperative_groups::thread_rank(*static_cast(this)); + break; +#endif + default: + break; + } + return rank; +} + +_CG_END_NAMESPACE + +#include +#if (!defined(_MSC_VER) || defined(_WIN64)) +# include +#endif + +# endif /* ! (__cplusplus, __CUDACC__) */ + +#endif /* !_COOPERATIVE_GROUPS_H_ */ diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/async.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/async.h new file mode 100644 index 0000000000000000000000000000000000000000..1b7dcb2433f2cb7d1ef61290995ac871a901b1e8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/async.h @@ -0,0 +1,452 @@ +/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CG_ASYNC_H +#define _CG_ASYNC_H + +#include "helpers.h" +#include "info.h" + +#include + +_CG_BEGIN_NAMESPACE + +namespace details { +// Groups supported by memcpy_async +template +struct _async_copy_group_supported : public _CG_STL_NAMESPACE::false_type {}; + +template +struct _async_copy_group_supported> + : public _CG_STL_NAMESPACE::true_type {}; +template <> +struct _async_copy_group_supported : public _CG_STL_NAMESPACE::true_type {}; +template <> +struct _async_copy_group_supported : public _CG_STL_NAMESPACE::true_type {}; + +template +using async_copy_group_supported = _async_copy_group_supported>; + +// Groups that require optimization +template +struct _async_copy_optimize_tile : public _CG_STL_NAMESPACE::false_type {}; + +template +struct _async_copy_optimize_tile> + : public _CG_STL_NAMESPACE::false_type {}; + +template +struct _async_copy_optimize_tile> + : public _CG_STL_NAMESPACE::true_type {}; + +template +using async_copy_optimize_tile = _async_copy_optimize_tile>; + +// SFINAE helpers for tile optimizations +template +using enable_tile_optimization = + typename _CG_STL_NAMESPACE::enable_if::value, void *>::type; + +template +using disable_tile_optimization = + typename _CG_STL_NAMESPACE::enable_if::value, void *>::type; + +// Segment for punning to aligned types +template +struct _Segment { + int _seg[N]; +}; + +// Trivial layout guaranteed-aligned copy-async compatible segments +template +struct Segment; +template <> +struct __align__(4) Segment<1> : public _Segment<1>{}; +template <> +struct __align__(8) Segment<2> : public _Segment<2>{}; +template <> +struct __align__(16) Segment<4> : public _Segment<4>{}; + +// Interleaved element by element copies from source to dest +template +_CG_STATIC_QUALIFIER void inline_copy(TyGroup &group, TyElem *__restrict__ dst, const TyElem *__restrict__ src, + size_t count) { + const unsigned int rank = group.thread_rank(); + const unsigned int stride = group.size(); + + for (size_t idx = rank; idx < count; idx += stride) { + dst[idx] = src[idx]; + } +} + +template = nullptr> +_CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst, + const TyElem *__restrict__ src, size_t count) { + static_assert(async_copy_group_supported::value, + "Async copy is only supported for groups that represent private shared memory"); + + if (count == 0) { + return; + } + + const bool dstIsNotShared = !__isShared(dst); + const bool srcIsNotGlobal = !__isGlobal(src); + + if (dstIsNotShared || srcIsNotGlobal) { + inline_copy(group, dst, src, count); + return; + } + + const unsigned int stride = group.size(); + const unsigned int rank = group.thread_rank(); + // Efficient copies require warps to operate on the same amount of work at each step. + // remainders are handled in a separate stage to prevent branching + const unsigned int subWarpMask = (stride - 1); + const unsigned int subwarpCopies = (subWarpMask & (unsigned int)count); + const unsigned int maxSubwarpRank = min(rank, subwarpCopies - 1); + + const size_t warpCopies = (count & (~subWarpMask)); + + for (size_t idx = 0; idx < warpCopies; idx += stride) { + size_t _srcIdx = rank + idx; + size_t _dstIdx = rank + idx; + __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem)); + } + + if (subwarpCopies) { + size_t _srcIdx = warpCopies + maxSubwarpRank; + size_t _dstIdx = warpCopies + maxSubwarpRank; + __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem)); + } +} + +template = nullptr> +_CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst, + const TyElem *__restrict__ src, size_t count) { + static_assert(async_copy_group_supported::value, + "Async copy is only supported for groups that represent private shared memory"); + + const bool dstIsNotShared = !__isShared(dst); + const bool srcIsNotGlobal = !__isGlobal(src); + + if (dstIsNotShared || srcIsNotGlobal) { + inline_copy(group, dst, src, count); + return; + } + + unsigned int stride = group.size(); + unsigned int rank = group.thread_rank(); + + for (size_t idx = rank; idx < count; idx += stride) { + size_t _srcIdx = idx; + size_t _dstIdx = idx; + __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem)); + } +} + +// Determine best possible alignment given an input and initial conditions +// Attempts to generate as little code as possible, most likely should only be used with 1 and 2 byte alignments +template +_CG_STATIC_QUALIFIER uint32_t find_best_alignment(void *__restrict__ dst, const void *__restrict__ src) { + // Narrowing conversion intentional + uint32_t base1 = (uint32_t) reinterpret_cast(src); + uint32_t base2 = (uint32_t) reinterpret_cast(dst); + + uint32_t diff = ((base1) ^ (base2)) & (MaxAlignment - 1); + + // range [MaxAlignment, alignof(elem)], step: x >> 1 + // over range of possible alignments, choose best available out of range + uint32_t out = MaxAlignment; +#pragma unroll + for (uint32_t alignment = (MaxAlignment >> 1); alignment >= MinAlignment; alignment >>= 1) { + if (alignment & diff) + out = alignment; + } + + return out; +} + +// Determine best possible alignment given an input and initial conditions +// Attempts to generate as little code as possible, most likely should only be used with 1 and 2 byte alignments +template +_CG_STATIC_QUALIFIER void copy_like(const TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src, + size_t count) { + const char *src = reinterpret_cast(_src); + char *dst = reinterpret_cast(_dst); + + constexpr uint32_t targetAlignment = (uint32_t)alignof(TyType); + + uint32_t base = (uint32_t) reinterpret_cast(src); + uint32_t alignOffset = ((~base) + 1) & (targetAlignment - 1); + + inline_copy(group, dst, src, alignOffset); + count -= alignOffset; + src += alignOffset; + dst += alignOffset; + + // Copy using the best available alignment, async_copy expects n-datums, not bytes + size_t asyncCount = count / sizeof(TyType); + accelerated_async_copy(group, reinterpret_cast(dst), reinterpret_cast(src), asyncCount); + asyncCount *= sizeof(TyType); + + count -= asyncCount; + src += asyncCount; + dst += asyncCount; + inline_copy(group, dst, src, count); +} + +// We must determine alignment and manually align src/dst ourselves +template +struct _memcpy_async_align_dispatch { + template + _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ dst, const void *__restrict__ src, size_t count) { + uint32_t alignment = find_best_alignment(dst, src); + + // Avoid copying the extra bytes if desired copy count is smaller + alignment = count < alignment ? AlignHint : alignment; + + switch (alignment) { + default: + case 1: + inline_copy(group, reinterpret_cast(dst), reinterpret_cast(src), count); + break; + case 2: + inline_copy(group, reinterpret_cast(dst), reinterpret_cast(src), count >> 1); + break; + case 4: + copy_like>(group, dst, src, count); + break; + case 8: + copy_like>(group, dst, src, count); + break; + case 16: + copy_like>(group, dst, src, count); + break; + } + } +}; + +// Specialization for 4 byte alignments +template <> +struct _memcpy_async_align_dispatch<4> { + template + _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src, + size_t count) { + const Segment<1> *src = reinterpret_cast *>(_src); + Segment<1> *dst = reinterpret_cast *>(_dst); + + // Dispatch straight to aligned LDGSTS calls + accelerated_async_copy(group, dst, src, count / sizeof(*dst)); + } +}; + +// Specialization for 8 byte alignments +template <> +struct _memcpy_async_align_dispatch<8> { + template + _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src, + size_t count) { + const Segment<2> *src = reinterpret_cast *>(_src); + Segment<2> *dst = reinterpret_cast *>(_dst); + + // Dispatch straight to aligned LDGSTS calls + accelerated_async_copy(group, dst, src, count / sizeof(*dst)); + } +}; + +// Alignments over 16 are truncated to 16 and bypass alignment +// This is the highest performing memcpy available +template <> +struct _memcpy_async_align_dispatch<16> { + template + _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src, + size_t count) { + const Segment<4> *src = reinterpret_cast *>(_src); + Segment<4> *dst = reinterpret_cast *>(_dst); + + // Dispatch straight to aligned LDGSTS calls + accelerated_async_copy(group, dst, src, count / sizeof(*dst)); + } +}; + +// byte-wide API +template +_CG_STATIC_QUALIFIER void _memcpy_async_dispatch_to_aligned_copy(const TyGroup &group, void *__restrict__ _dst, + const void *__restrict__ _src, size_t count) { + static_assert(!(Alignment & (Alignment - 1)), "Known static alignment dispatch must be a power of 2"); + details::_memcpy_async_align_dispatch::copy(group, _dst, _src, count); +} + +// Internal dispatch APIs +// These deduce the alignments and sizes necessary to invoke the underlying copy engine +template +using is_void = _CG_STL_NAMESPACE::is_same; + +template +using enable_if_not_void = typename _CG_STL_NAMESPACE::enable_if::value, void *>::type; + +template +using enable_if_void = typename _CG_STL_NAMESPACE::enable_if::value, void *>::type; + +template +using enable_if_integral = + typename _CG_STL_NAMESPACE::enable_if<_CG_STL_NAMESPACE::is_integral::value, void *>::type; + +// byte-wide API using aligned_sized_t +template typename Alignment, size_t Hint> +_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, void *__restrict__ _dst, + const void *__restrict__ _src, const Alignment &count) { + constexpr size_t _align = (Hint > 16) ? 16 : Hint; + + details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, (size_t)count); +} + +// byte-wide API using type for aligment +template = nullptr, enable_if_integral = nullptr> +_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst, + const TyElem *__restrict__ _src, const TySize& count) { + constexpr size_t _align = (Hint > 16) ? 16 : Hint; + + details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, count); +} + +// byte-wide API with full alignment deduction required +template = nullptr, + enable_if_integral = nullptr> +_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst, + const TyElem *__restrict__ _src, const TySize& count) { + details::_memcpy_async_dispatch_to_aligned_copy<1>(group, _dst, _src, count); +} + +// 1d-datum API +template +_CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const size_t dstCount, + const TyElem *__restrict__ src, const size_t srcCount) { + constexpr unsigned int _align = Hint; + const size_t totalCount = min(dstCount, srcCount) * sizeof(TyElem); + + details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount); +} + +// 1d-datum API using aligned_size_t +template typename Alignment, size_t Hint> +_CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const Alignment &dstCount, + const TyElem *__restrict__ src, const Alignment &srcCount) { + constexpr unsigned int _align = Hint; + const size_t totalCount = min((size_t)dstCount, (size_t)srcCount) * sizeof(TyElem); + + details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount); +} + +} // namespace details + +/* + * Group submit batch of async-copy to cover contiguous 1D array + * and commit that batch to eventually wait for completion. + */ +template +_CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ _dst, const TyElem *__restrict__ _src, + const TySizeT &count) { + details::_memcpy_async_bytes(group, _dst, _src, count); + __pipeline_commit(); +} + +/* + * Group submit batch of async-copy to cover contiguous 1D array + * and commit that batch to eventually wait for completion. + * Object counts are in datum sized chunks, not bytes. + */ +template +_CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ dst, const DstLayout &dstLayout, + const TyElem *__restrict__ src, const SrcLayout &srcLayout) { + details::_memcpy_async_datum(group, dst, dstLayout, src, srcLayout); + __pipeline_commit(); +} + +/* Group wait for prior Nth stage of memcpy_async to complete. */ +template +_CG_STATIC_QUALIFIER void wait_prior(const TyGroup &group) { + __pipeline_wait_prior(Stage); + group.sync(); +} + +/* Group wait all previously submitted memcpy_async to complete. */ +template +_CG_STATIC_QUALIFIER void wait(const TyGroup &group) { + __pipeline_wait_prior(0); + group.sync(); +} + +/***************** CG APIs including pipeline are deprecated *****************/ + +/* Group submit batch of async-copy to cover of contiguous 1D array + to a pipeline and commit the batch*/ +template +_CG_DEPRECATED _CG_STATIC_QUALIFIER void memcpy_async(TyGroup &group, TyElem *dst, size_t dstCount, const TyElem *src, size_t srcCount, + nvcuda::experimental::pipeline &pipe) { + details::_memcpy_async_datum(group, dst, dstCount, src, srcCount); + pipe.commit(); +} + +/* Group wait for prior Nth stage of memcpy_async to complete. */ +template +_CG_DEPRECATED _CG_STATIC_QUALIFIER void wait_prior(TyGroup &group, nvcuda::experimental::pipeline &pipe) { + pipe.wait_prior(); + group.sync(); +} + +/* Group wait for stage-S of memcpy_async to complete. */ +template +_CG_DEPRECATED _CG_STATIC_QUALIFIER void wait(TyGroup &group, nvcuda::experimental::pipeline &pipe, size_t stage) { + pipe.wait(stage); + group.sync(); +} +_CG_END_NAMESPACE + +#endif // _CG_ASYNC_H diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/coalesced_reduce.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/coalesced_reduce.h new file mode 100644 index 0000000000000000000000000000000000000000..7ba03fc9e4d0c78f07e3e5e1f97aff03e7a3d6f8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/coalesced_reduce.h @@ -0,0 +1,95 @@ + /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CG_COALESCED_REDUCE_H_ +#define _CG_COALESCED_REDUCE_H_ + +#include "info.h" +#include "helpers.h" +#include "cooperative_groups.h" +#include "partitioning.h" +#include "coalesced_scan.h" + +_CG_BEGIN_NAMESPACE + +namespace details { + +template +_CG_QUALIFIER auto coalesced_reduce(const __single_warp_thread_block_tile& group, + TyVal&& val, + TyOp&& op) -> decltype(op(val, val)) { + auto out = val; + for (int mask = TySize >> 1; mask > 0; mask >>= 1) { + out = op(out, group.shfl_xor(out, mask)); + } + + return out; +} + +template +_CG_QUALIFIER auto coalesced_reduce(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) { + if (group.size() == 32) { + // Full coalesced group can go through faster path by being treated as a tile of size 32 + auto tile = details::tiled_partition_internal<32, void>(); + return coalesced_reduce(tile, _CG_STL_NAMESPACE::forward(val), _CG_STL_NAMESPACE::forward(op)); + } + else { + auto scan_result = + inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward(val), _CG_STL_NAMESPACE::forward(op)); + unsigned int group_mask = _coalesced_group_data_access::get_mask(group); + unsigned int last_thread_id = 31 - __clz(group_mask); + return details::tile::shuffle_dispatch::shfl( + _CG_STL_NAMESPACE::forward(scan_result), group_mask, last_thread_id, 32); + } +} + +} // details + +_CG_END_NAMESPACE + +#endif // _CG_COALESCED_REDUCE_H_ diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/functional.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/functional.h new file mode 100644 index 0000000000000000000000000000000000000000..0f151fe2c270421ba56e22935e84c4bf93790eff --- /dev/null +++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/functional.h @@ -0,0 +1,212 @@ + /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CG_FUNCTIONAL_H +#define _CG_FUNCTIONAL_H + +#include "info.h" +#include "helpers.h" + +#ifdef _CG_CPP11_FEATURES +#ifdef _CG_USE_CUDA_STL +# include +#endif + +_CG_BEGIN_NAMESPACE + +namespace details { +#ifdef _CG_USE_CUDA_STL + using cuda::std::plus; + using cuda::std::bit_and; + using cuda::std::bit_xor; + using cuda::std::bit_or; +#else + template struct plus {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 + arg2;}}; + template struct bit_and {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 & arg2;}}; + template struct bit_xor {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 ^ arg2;}}; + template struct bit_or {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 | arg2;}}; +#endif // _CG_USE_PLATFORM_STL +} // details + +template +struct plus : public details::plus {}; + +template +struct less { + __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const { + return (arg2 < arg1) ? arg2 : arg1; + } +}; + +template +struct greater { + __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const { + return (arg1 < arg2) ? arg2 : arg1; + } +}; + +template +struct bit_and : public details::bit_and {}; + +template +struct bit_xor : public details::bit_xor {}; + +template +struct bit_or : public details::bit_or {}; + +#if defined(_CG_HAS_STL_ATOMICS) +namespace details { + template + using _atomic_is_type_supported = _CG_STL_NAMESPACE::integral_constant::value && (sizeof(Ty) == 4 || sizeof(Ty) == 8)>; + + template struct _atomic_op_supported : public _CG_STL_NAMESPACE::false_type {}; + template struct _atomic_op_supported> : public _atomic_is_type_supported {}; + template struct _atomic_op_supported> : public _atomic_is_type_supported {}; + template struct _atomic_op_supported> : public _atomic_is_type_supported {}; + template struct _atomic_op_supported> : public _atomic_is_type_supported {}; + template struct _atomic_op_supported> : public _atomic_is_type_supported {}; + template struct _atomic_op_supported> : public _atomic_is_type_supported {}; + + template + _CG_QUALIFIER remove_qual atomic_cas_fallback(TyAtomic&& atomic, TyVal&& val, TyOp&& op) { + auto old = atomic.load(cuda::std::memory_order_relaxed); + while(!atomic.compare_exchange_weak(old, op(old, val), cuda::std::memory_order_relaxed)); + return old; + } + + template + struct op_picker; + + template + struct op_picker> { + template + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_add(val, cuda::std::memory_order_relaxed); + } + }; + + template + struct op_picker> { + template + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_min(val, cuda::std::memory_order_relaxed); + } + }; + + template + struct op_picker> { + template + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_max(val, cuda::std::memory_order_relaxed); + } + }; + + template + struct op_picker> { + template + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_and(val, cuda::std::memory_order_relaxed); + } + }; + + template + struct op_picker> { + template + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_xor(val, cuda::std::memory_order_relaxed); + } + }; + + template + struct op_picker> { + template + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_or(val, cuda::std::memory_order_relaxed); + } + }; + + template + struct atomic_update_dispatch {}; + + template<> + struct atomic_update_dispatch { + template + _CG_STATIC_QUALIFIER remove_qual atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) { + return atomic_cas_fallback(atomic, _CG_STL_NAMESPACE::forward(val), _CG_STL_NAMESPACE::forward(op)); + } + }; + + template<> + struct atomic_update_dispatch { + template + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val, TyOp&& op) { + using dispatch = op_picker>; + + return dispatch::atomic_update(atomic, val); + } + }; + + template + _CG_QUALIFIER remove_qual atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) { + using dispatch = atomic_update_dispatch<_atomic_op_supported>::value>; + + return dispatch::atomic_update(atomic, _CG_STL_NAMESPACE::forward(val), _CG_STL_NAMESPACE::forward(op)); + } + + template + _CG_QUALIFIER void atomic_store(TyAtomic& atomic, TyVal&& val) { + atomic.store(val, cuda::std::memory_order_relaxed); + } +} +#endif + +_CG_END_NAMESPACE + +#endif +#endif //_CG_FUNCTIONAL_H diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/helpers.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..1485d9f503daa8d518af75775f7a7a415cb031d4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/helpers.h @@ -0,0 +1,693 @@ + /* Copyright 1993-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _COOPERATIVE_GROUPS_HELPERS_H_ +# define _COOPERATIVE_GROUPS_HELPERS_H_ + +#include "info.h" +#include "sync.h" + +_CG_BEGIN_NAMESPACE + +namespace details { +#ifdef _CG_CPP11_FEATURES + template struct _is_float_or_half : public _CG_STL_NAMESPACE::is_floating_point {}; +# ifdef _CG_HAS_FP16_COLLECTIVE + template <> struct _is_float_or_half<__half> : public _CG_STL_NAMESPACE::true_type {}; + template <> struct _is_float_or_half<__half2> : public _CG_STL_NAMESPACE::true_type {}; +# endif + template + using is_float_or_half = _is_float_or_half::type>; + + // Non-STL utility templates + template + using remove_qual = typename _CG_STL_NAMESPACE::remove_cv::type>::type; + + template + using is_op_type_same = _CG_STL_NAMESPACE::is_same, remove_qual + >; +#endif + + template + _CG_STATIC_QUALIFIER TyTrunc vec3_to_linear(dim3 index, dim3 nIndex) { + return ((TyTrunc)index.z * nIndex.y * nIndex.x) + + ((TyTrunc)index.y * nIndex.x) + + (TyTrunc)index.x; + } + + namespace cta { + + _CG_STATIC_QUALIFIER void sync() + { + __barrier_sync(0); + } + + _CG_STATIC_QUALIFIER unsigned int num_threads() + { + return static_cast(blockDim.x * blockDim.y * blockDim.z); + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank() + { + return vec3_to_linear(threadIdx, blockDim); + } + + _CG_STATIC_QUALIFIER dim3 group_index() + { + return dim3(blockIdx.x, blockIdx.y, blockIdx.z); + } + + _CG_STATIC_QUALIFIER dim3 thread_index() + { + return dim3(threadIdx.x, threadIdx.y, threadIdx.z); + } + + _CG_STATIC_QUALIFIER dim3 dim_threads() + { + return dim3(blockDim.x, blockDim.y, blockDim.z); + } + + // Legacy aliases + _CG_STATIC_QUALIFIER unsigned int size() + { + return num_threads(); + } + + _CG_STATIC_QUALIFIER dim3 block_dim() + { + return dim_threads(); + } + + }; + + class _coalesced_group_data_access { + public: + // Retrieve mask of coalesced groups and tiles + template + _CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup &group) { + return group.get_mask(); + } + + template + _CG_STATIC_QUALIFIER TyGroup construct_from_mask(unsigned int mask) { + return TyGroup(mask); + } + + template + _CG_STATIC_QUALIFIER void modify_meta_group(TyGroup &group, unsigned int mgRank, unsigned int mgSize) { + group._data.coalesced.metaGroupRank = mgRank; + group._data.coalesced.metaGroupSize = mgSize; + } + }; + + namespace tile { + template + struct _tile_helpers{ + _CG_STATIC_CONST_DECL unsigned int tileCount = TileCount; + _CG_STATIC_CONST_DECL unsigned int tileMask = TileMask; + _CG_STATIC_CONST_DECL unsigned int laneMask = LaneMask; + _CG_STATIC_CONST_DECL unsigned int shiftCount = ShiftCount; + }; + + template struct tile_helpers; + template <> struct tile_helpers<32> : public _tile_helpers<1, 0xFFFFFFFF, 0x1F, 5> {}; + template <> struct tile_helpers<16> : public _tile_helpers<2, 0x0000FFFF, 0x0F, 4> {}; + template <> struct tile_helpers<8> : public _tile_helpers<4, 0x000000FF, 0x07, 3> {}; + template <> struct tile_helpers<4> : public _tile_helpers<8, 0x0000000F, 0x03, 2> {}; + template <> struct tile_helpers<2> : public _tile_helpers<16, 0x00000003, 0x01, 1> {}; + template <> struct tile_helpers<1> : public _tile_helpers<32, 0x00000001, 0x00, 0> {}; + +#ifdef _CG_CPP11_FEATURES + namespace shfl { + /*********************************************************************************** + * Recursively Sliced Shuffle + * Purpose: + * Slices an input type a number of times into integral types so that shuffles + * are well defined + * Expectations: + * This object *should not* be used from a reinterpret_cast pointer unless + * some alignment guarantees can be met. Use a memcpy to guarantee that loads + * from the integral types stored within are aligned and correct. + **********************************************************************************/ + template + struct recursive_sliced_shuffle_helper; + + template + struct recursive_sliced_shuffle_helper { + int val; + + template + _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) { + val = shfl(val); + } + }; + + template + struct recursive_sliced_shuffle_helper { + int val; + recursive_sliced_shuffle_helper next; + + template + _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) { + val = shfl(val); + next.invoke_shuffle(shfl); + } + }; + } + + struct _memory_shuffle { + template + _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) { + static_assert(sizeof(TyElem) <= 32, "Cooperative groups collectives are limited to types smaller than 32B"); + return TyElem{}; + } + + template > + _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) { + auto shfl = [=](int val) -> int { + return 0; + }; + + return _shfl_internal(_CG_STL_NAMESPACE::forward(elem), shfl); + } + + template > + _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) { + auto shfl = [=](int val) -> int { + return 0; + }; + + return _shfl_internal(_CG_STL_NAMESPACE::forward(elem), shfl); + } + + template > + _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) { + auto shfl = [=](int val) -> int { + return 0; + }; + + return _shfl_internal(_CG_STL_NAMESPACE::forward(elem), shfl); + } + + template > + _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) { + auto shfl = [=](int val) -> int { + return 0; + }; + + return _shfl_internal(_CG_STL_NAMESPACE::forward(elem), shfl); + } + }; + + /*********************************************************************************** + * Intrinsic Device Function Shuffle + * Purpose: + * Uses a shuffle helper that has characteristics best suited for moving + * elements between threads + * Expectations: + * Object given will be forced into an l-value type so that it can be used + * with a helper structure that reinterprets the data into intrinsic compatible + * types + * Notes: + * !! TyRet is required so that objects are returned by value and not as + * dangling references depending on the value category of the passed object + **********************************************************************************/ + struct _intrinsic_compat_shuffle { + template + using shfl_helper = shfl::recursive_sliced_shuffle_helper; + + template + _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) { + static_assert(__is_trivially_copyable(TyElem), "Type is not compatible with device shuffle"); + shfl_helper helper; + memcpy(&helper, &elem, sizeof(TyElem)); + helper.invoke_shuffle(fn); + memcpy(&elem, &helper, sizeof(TyElem)); + return elem; + } + + template > + _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) { + auto shfl = [=](int val) -> int { + return __shfl_sync(gMask, val, srcRank, threads); + }; + + return _shfl_internal(_CG_STL_NAMESPACE::forward(elem), shfl); + } + + template > + _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) { + auto shfl = [=](int val) -> int { + return __shfl_down_sync(gMask, val, delta, threads); + }; + + return _shfl_internal(_CG_STL_NAMESPACE::forward(elem), shfl); + } + + template > + _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) { + auto shfl = [=](int val) -> int { + return __shfl_up_sync(gMask, val, delta, threads); + }; + + return _shfl_internal(_CG_STL_NAMESPACE::forward(elem), shfl); + } + + template > + _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) { + auto shfl = [=](int val) -> int { + return __shfl_xor_sync(gMask, val, lMask, threads); + }; + + return _shfl_internal(_CG_STL_NAMESPACE::forward(elem), shfl); + } + }; + + struct _native_shuffle { + template + _CG_STATIC_QUALIFIER TyElem shfl( + TyElem elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) { + return static_cast(__shfl_sync(gMask, elem, srcRank, threads)); + } + + template + _CG_STATIC_QUALIFIER TyElem shfl_down( + TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) { + return static_cast(__shfl_down_sync(gMask, elem, delta, threads)); + } + + template + _CG_STATIC_QUALIFIER TyElem shfl_up( + TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) { + return static_cast(__shfl_up_sync(gMask, elem, delta, threads)); + } + + template + _CG_STATIC_QUALIFIER TyElem shfl_xor( + TyElem elem, unsigned int gMask, unsigned int lMask, unsigned int threads) { + return static_cast(__shfl_xor_sync(gMask, elem, lMask, threads)); + } + }; + + // Almost all arithmetic types are supported by native shuffle + // Vector types are the exception + template + using use_native_shuffle = _CG_STL_NAMESPACE::integral_constant< + bool, + _CG_STL_NAMESPACE::is_integral< + remove_qual>::value || + details::is_float_or_half< + remove_qual>::value + >; + + constexpr unsigned long long _MemoryShuffleCutoff = 32; + + template ::value, + bool InMem = (sizeof(TyElem) > _MemoryShuffleCutoff)> + struct shuffle_dispatch; + + template + struct shuffle_dispatch : public _native_shuffle {}; + + template + struct shuffle_dispatch : public _intrinsic_compat_shuffle {}; + + template + struct shuffle_dispatch : public _memory_shuffle {}; + +#endif //_CG_CPP11_FEATURES + }; + + namespace multi_grid { + struct multi_grid_functions; + }; + + namespace grid { + _CG_STATIC_QUALIFIER unsigned int barrier_arrive(unsigned int *bar) { + return details::sync_grids_arrive(bar); + } + + _CG_STATIC_QUALIFIER void barrier_wait(unsigned int token, unsigned int *bar) { + details::sync_grids_wait(token, bar); + } + + _CG_STATIC_QUALIFIER void sync(unsigned int *bar) { + unsigned int token = details::sync_grids_arrive(bar); + details::sync_grids_wait(token, bar); + } + + _CG_STATIC_QUALIFIER unsigned long long num_blocks() + { + // grid.y * grid.z -> [max(65535) * max(65535)] fits within 4b, promote after multiplication + // grid.x * (grid.y * grid.z) -> [max(2^31-1) * max(65535 * 65535)] exceeds 4b, promote before multiplication + return (unsigned long long)gridDim.x * (gridDim.y * gridDim.z); + } + + _CG_STATIC_QUALIFIER unsigned long long num_threads() + { + return num_blocks() * cta::num_threads(); + } + + _CG_STATIC_QUALIFIER unsigned long long block_rank() + { + return vec3_to_linear(blockIdx, gridDim); + } + + _CG_STATIC_QUALIFIER unsigned long long thread_rank() + { + return block_rank() * cta::num_threads() + cta::thread_rank(); + } + + _CG_STATIC_QUALIFIER dim3 dim_blocks() + { + return dim3(gridDim.x, gridDim.y, gridDim.z); + } + + _CG_STATIC_QUALIFIER dim3 block_index() + { + return dim3(blockIdx.x, blockIdx.y, blockIdx.z); + } + + _CG_STATIC_QUALIFIER dim3 dim_threads() + { + return dim3(gridDim.x * blockDim.x, gridDim.y * blockDim.y, gridDim.z * blockDim.z); + } + + _CG_STATIC_QUALIFIER dim3 thread_index() + { + return dim3(blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y, + blockIdx.z * blockDim.z + threadIdx.z); + } + +#if defined(_CG_HAS_CLUSTER_GROUP) + _CG_STATIC_QUALIFIER dim3 dim_clusters() { + return __clusterGridDimInClusters(); + } + + _CG_STATIC_QUALIFIER unsigned long long num_clusters() { + const dim3 dimClusters = dim_clusters(); + return dimClusters.x * dimClusters.y * dimClusters.z; + } + + _CG_STATIC_QUALIFIER dim3 cluster_index() { + return __clusterIdx(); + } + + _CG_STATIC_QUALIFIER unsigned long long cluster_rank() { + return vec3_to_linear(cluster_index(), dim_clusters()); + } +#endif + + // Legacy aliases + _CG_STATIC_QUALIFIER unsigned long long size() + { + return num_threads(); + } + + _CG_STATIC_QUALIFIER dim3 grid_dim() + { + return dim_blocks(); + } + }; + + +#if defined(_CG_HAS_MULTI_GRID_GROUP) + + namespace multi_grid { + _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle() + { +#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__) + //this function is defined in device runtime library + //which requires separate compilation mode (__CUDACC_RDC__) + //or extended whole program mode (__CUDACC_EWP__) + return (cudaCGGetIntrinsicHandle(cudaCGScopeMultiGrid)); +#else /* !(__CUDACC_RDC__ || __CUDACC_EWP__) */ + return 0; +#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */ + } + + _CG_STATIC_QUALIFIER void sync(const unsigned long long handle) + { +#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__) + //this function is defined in device runtime library + //which requires separate compilation mode (__CUDACC_RDC__) + //or extended whole program mode (__CUDACC_EWP__) + cudaError_t err = cudaCGSynchronize(handle, 0); +#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */ + } + + _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle) + { + unsigned int numThreads = 0; +#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__) + //this function is defined in device runtime library + //which requires separate compilation mode (__CUDACC_RDC__) + //or extended whole program mode (__CUDACC_EWP__) + cudaCGGetSize(&numThreads, NULL, handle); +#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */ + return numThreads; + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle) + { + unsigned int threadRank = 0; +#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__) + //this function is defined in device runtime library + //which requires separate compilation mode (__CUDACC_RDC__) + //or extended whole program mode (__CUDACC_EWP__) + cudaCGGetRank(&threadRank, NULL, handle); +#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */ + return threadRank; + } + + _CG_STATIC_QUALIFIER unsigned int grid_rank(const unsigned long long handle) + { + unsigned int gridRank = 0; +#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__) + //this function is defined in device runtime library + //which requires separate compilation mode (__CUDACC_RDC__) + //or extended whole program mode (__CUDACC_EWP__) + cudaCGGetRank(NULL, &gridRank, handle); +#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */ + return gridRank; + } + + _CG_STATIC_QUALIFIER unsigned int num_grids(const unsigned long long handle) + { + unsigned int numGrids = 0; +#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__) + //this function is defined in device runtime library + //which requires separate compilation mode (__CUDACC_RDC__) + //or extended whole program mode (__CUDACC_EWP__) + cudaCGGetSize(NULL, &numGrids, handle); +#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */ + return numGrids; + } + +# ifdef _CG_CPP11_FEATURES + struct multi_grid_functions { + decltype(multi_grid::get_intrinsic_handle) *get_intrinsic_handle; + decltype(multi_grid::sync) *sync; + decltype(multi_grid::size) *size; + decltype(multi_grid::thread_rank) *thread_rank; + decltype(multi_grid::grid_rank) *grid_rank; + decltype(multi_grid::num_grids) *num_grids; + }; + + template + _CG_STATIC_QUALIFIER const multi_grid_functions* load_grid_intrinsics() { + __constant__ static const multi_grid_functions mgf { + &multi_grid::get_intrinsic_handle, + &multi_grid::sync, + &multi_grid::size, + &multi_grid::thread_rank, + &multi_grid::grid_rank, + &multi_grid::num_grids + }; + + return &mgf; + } +# endif + }; +#endif + +#if defined(_CG_HAS_CLUSTER_GROUP) + namespace cluster { + + _CG_STATIC_QUALIFIER bool isReal() + { + return __clusterDimIsSpecified(); + } + + _CG_STATIC_QUALIFIER void barrier_arrive() + { + __cluster_barrier_arrive(); + } + + _CG_STATIC_QUALIFIER void barrier_wait() + { + __cluster_barrier_wait(); + } + + _CG_STATIC_QUALIFIER void sync() + { + barrier_arrive(); + barrier_wait(); + } + + _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr) + { + return __cluster_query_shared_rank(addr); + } + + template + _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank) + { + return static_cast(__cluster_map_shared_rank(addr, rank)); + } + + _CG_STATIC_QUALIFIER dim3 block_index() + { + return __clusterRelativeBlockIdx(); + } + + _CG_STATIC_QUALIFIER unsigned int block_rank() + { + return __clusterRelativeBlockRank(); + } + + _CG_STATIC_QUALIFIER dim3 thread_index() + { + const dim3 blockIndex = block_index(); + return dim3(blockIndex.x * blockDim.x + threadIdx.x, + blockIndex.y * blockDim.y + threadIdx.y, + blockIndex.z * blockDim.z + threadIdx.z); + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank() + { + return block_rank() * cta::num_threads() + cta::thread_rank(); + } + + _CG_STATIC_QUALIFIER dim3 dim_blocks() + { + return __clusterDim(); + } + + _CG_STATIC_QUALIFIER unsigned int num_blocks() + { + return __clusterSizeInBlocks(); + } + + _CG_STATIC_QUALIFIER dim3 dim_threads() + { + const dim3 dimBlocks = dim_blocks(); + const unsigned int x = dimBlocks.x * blockDim.x; + const unsigned int y = dimBlocks.y * blockDim.y; + const unsigned int z = dimBlocks.z * blockDim.z; + return dim3(x, y, z); + } + + _CG_STATIC_QUALIFIER unsigned int num_threads() + { + return num_blocks() * cta::num_threads(); + } + + }; +#endif + + _CG_STATIC_QUALIFIER unsigned int laneid() + { + unsigned int laneid; + asm ("mov.u32 %0, %%laneid;" : "=r"(laneid)); + return laneid; + } + + _CG_STATIC_QUALIFIER unsigned int lanemask32_eq() + { + unsigned int lanemask32_eq; + asm ("mov.u32 %0, %%lanemask_eq;" : "=r"(lanemask32_eq)); + return (lanemask32_eq); + } + + _CG_STATIC_QUALIFIER unsigned int lanemask32_lt() + { + unsigned int lanemask32_lt; + asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask32_lt)); + return (lanemask32_lt); + } + + _CG_STATIC_QUALIFIER void abort() + { + _CG_ABORT(); + } + + template + _CG_QUALIFIER void assert_if_not_arithmetic() { +#ifdef _CG_CPP11_FEATURES + static_assert( + _CG_STL_NAMESPACE::is_integral::value || + details::is_float_or_half::value, + "Error: Ty is neither integer or float" + ); +#endif //_CG_CPP11_FEATURES + } + +#ifdef _CG_CPP11_FEATURES + _CG_STATIC_QUALIFIER constexpr unsigned int log2(unsigned int x) { + return x == 1 ? 0 : 1 + log2(x / 2); + } +#endif //_CG_CPP11_FEATURES + +}; // !Namespace internal + +_CG_END_NAMESPACE + +#endif /* !_COOPERATIVE_GROUPS_HELPERS_H_ */ diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/memory.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/memory.h new file mode 100644 index 0000000000000000000000000000000000000000..47cf260f3b4e0b29bf08c948697102bf027616db --- /dev/null +++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/memory.h @@ -0,0 +1,135 @@ +/* Copyright 1993-2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _COOPERATIVE_GROUPS_MEMORY_H_ +# define _COOPERATIVE_GROUPS_MEMORY_H_ + +#include "info.h" + +_CG_BEGIN_NAMESPACE + +#if defined(_CG_CPP11_FEATURES) +namespace details { + _CG_STATIC_CONST_DECL int scratch_num_reserved_bytes = 12; + +#if defined(_CG_HAS_RESERVED_SHARED) + _CG_STATIC_QUALIFIER void* reserved_shared_ptr() + { + void *ptr; + asm ("{\n\t" + " .reg .u32 start;\n\t" + " .reg .u64 extended;\n\t" + " mov.u32 start, %%reserved_smem_offset_1;\n\t" + " cvt.u64.u32 extended, start;\n\t" + " cvta.shared.u64 %0, extended;\n\t" + "}" + : "=" _CG_ASM_PTR_CONSTRAINT(ptr)); + return ptr; + } +#endif + + struct multi_warp_scratch { + // One barrier per possible size of the group. + _CG_STATIC_CONST_DECL unsigned int memory_barriers_count = 5; + _CG_STATIC_CONST_DECL size_t sync_memory_size = memory_barriers_count * sizeof(barrier_t); + + using communication_type = unsigned long long; + _CG_STATIC_CONST_DECL size_t communication_size = sizeof(communication_type); + + // Layout of the scratch space: + barrier_t barriers[memory_barriers_count]; + char reserved[scratch_num_reserved_bytes]; // Reserve 12 bytes for future use + communication_type communication_memory[default_max_block_size / 32]; + + _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int scratch_size_needed(unsigned int max_block_size) { + // One slot of collectives memory per warp. + return scratch_num_reserved_bytes + sync_memory_size + max_block_size / 32 * communication_size; + } + + _CG_QUALIFIER void init_barriers(unsigned int thread_rank) { + if (thread_rank < memory_barriers_count) { + barriers[thread_rank] = 0; + } + } + }; + +#if defined(_CG_HAS_RESERVED_SHARED) + // CG can expect at least 288 bytes available in reserved shared + static_assert(sizeof(multi_warp_scratch) <= 288, "multi-warp scratch size is too large"); +#endif + + // Make sure the structure can fit into the user provided memory + static_assert(sizeof(multi_warp_scratch) <= multi_warp_scratch::scratch_size_needed(default_max_block_size), + "multi-warp scratch size is too large"); + + + _CG_QUALIFIER multi_warp_scratch* get_scratch_ptr(void* user_scratch) { + void *ptr; +#if defined(_CG_HAS_RESERVED_SHARED) + ptr = reserved_shared_ptr(); +#else + ptr = user_scratch; +#endif + return static_cast(ptr); + + } + +} + +template +struct __align__(details::multi_warp_scratch::communication_size) block_tile_memory { +private: +#if !defined(_CG_HAS_RESERVED_SHARED) + char scratch[details::multi_warp_scratch::scratch_size_needed(MaxBlockSize)]; +#endif +}; +#endif + +_CG_END_NAMESPACE + +#endif /* !_COOPERATIVE_GROUPS_MEMORY_H_ */ diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/reduce.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/reduce.h new file mode 100644 index 0000000000000000000000000000000000000000..0313b52a23f440e283509993d6f7997ba5df2365 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/reduce.h @@ -0,0 +1,419 @@ + /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CG_REDUCE_H_ +#define _CG_REDUCE_H_ + +#include "info.h" +#include "helpers.h" +#include "coalesced_reduce.h" +#include "functional.h" +#include "cooperative_groups.h" + +_CG_BEGIN_NAMESPACE + +namespace details { + + template + using _redux_is_add_supported = _CG_STL_NAMESPACE::integral_constant< + bool, + _CG_STL_NAMESPACE::is_integral::value && (sizeof(Ty) <= 4)>; + + template + using redux_is_add_supported = _redux_is_add_supported; + + // A specialization for 64 bit logical operations is possible + // but for now only accelerate 32 bit bitwise ops + template + using redux_is_logical_supported = redux_is_add_supported; + + // Base operator support case + template struct _redux_op_supported : public _CG_STL_NAMESPACE::false_type {}; +#ifdef _CG_HAS_OP_REDUX + template struct _redux_op_supported, Ty> : public redux_is_add_supported {}; + template struct _redux_op_supported, Ty> : public redux_is_add_supported {}; + template struct _redux_op_supported, Ty> : public redux_is_add_supported {}; + template struct _redux_op_supported, Ty> : public redux_is_logical_supported {}; + template struct _redux_op_supported, Ty> : public redux_is_logical_supported {}; + template struct _redux_op_supported, Ty> : public redux_is_logical_supported {}; +#endif + + template class TyOp> + using redux_op_supported = _redux_op_supported< + typename details::remove_qual>, + Ty>; + + // Groups smaller than 16 actually have worse performance characteristics when used with redux + // tiles of size 16 and 32 perform the same or better and have better code generation profiles + template struct _redux_group_optimized : public _CG_STL_NAMESPACE::false_type {}; + + template + struct _redux_group_optimized> : public _CG_STL_NAMESPACE::integral_constant< + bool, + (Sz >= 16)> {}; + template + struct _redux_group_optimized> : public _CG_STL_NAMESPACE::integral_constant< + bool, + (Sz >= 16)> {}; + template <> + struct _redux_group_optimized : public _CG_STL_NAMESPACE::true_type {}; + + template + using redux_group_optimized = _redux_group_optimized>; + + template