diff --git a/.venv/lib/python3.11/site-packages/triton/backends/amd/compiler.py b/.venv/lib/python3.11/site-packages/triton/backends/amd/compiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdf2f863b3d6d67ad997a25fb24e954c9966a1b5
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/amd/compiler.py
@@ -0,0 +1,262 @@
+from triton.backends.compiler import BaseBackend, GPUTarget
+from triton._C.libtriton import ir, passes, llvm, amd
+from dataclasses import dataclass
+from typing import Any, Tuple
+import hashlib
+import tempfile
+import os
+import re
+import subprocess
+import functools
+from pathlib import Path
+
+
+@dataclass(frozen=True)
+class HIPOptions:  # Immutable bundle of HIP compile options; __post_init__ derives warp_size and extern_libs.
+    num_warps: int = 4
+    waves_per_eu: int = 1
+    num_stages: int = 0
+    num_ctas: int = 1
+    extern_libs: dict = None
+    cluster_dims: tuple = (1, 1, 1)
+    debug: bool = False
+    arch: str = None
+    allow_fp8e4nv: bool = False
+    allow_fp8e4b15: bool = False
+    default_dot_input_precision: str = "ieee"
+    allowed_dot_input_precisions: Tuple[str] = ("ieee", )
+    enable_fp_fusion: bool = True
+    matrix_instr_nonkdim: int = 0
+    kpack: int = 1
+    allow_flush_denorm: bool = False
+    max_num_imprecise_acc_default: int = 0
+    backend_name: str = 'hip'
+
+    def __post_init__(self):
+        # The dataclass is frozen, so derived values are stored with object.__setattr__.
+        default_libdir = Path(__file__).parent / 'lib'
+        extern_libs = {} if self.extern_libs is None else dict(self.extern_libs)
+        # gfx10/gfx11 use warp size 32, anything else 64. `arch` defaults to None, which must not raise TypeError here.
+        warp_size = 32 if self.arch and ('gfx10' in self.arch or 'gfx11' in self.arch) else 64
+        object.__setattr__(self, 'warp_size', warp_size)
+        for lib in ("ocml", "ockl"):  # these two names always point at the .bc files in default_libdir
+            extern_libs[lib] = str(default_libdir / f'{lib}.bc')
+        object.__setattr__(self, 'extern_libs', tuple(extern_libs.items()))
+        assert self.num_warps > 0 and (self.num_warps & (self.num_warps - 1)) == 0, \
+               "num_warps must be a power of 2"
+
+    def hash(self):
+        key = '_'.join([f'{name}-{val}' for name, val in self.__dict__.items()])  # includes the derived warp_size
+        return hashlib.sha256(key.encode("utf-8")).hexdigest()
+
+
+class HIPBackend(BaseBackend):
+
+    @staticmethod
+    def supports_target(target: GPUTarget):  # True only for targets whose backend name is 'hip'.
+        return target.backend == 'hip'
+
+    def __init__(self, target: GPUTarget) -> None:
+        super().__init__(target)
+        assert isinstance(target.arch, str)  # parse_options forwards target.arch into HIPOptions as a string
+        self.binary_ext = "hsaco"  # same key as the last stage registered in add_stages
+
+    def parse_options(self, opts) -> Any:
+        # Seed with the target's arch, then let every HIPOptions field that appears in `opts` override the seed.
+        base = {'arch': self.target.arch}
+        return HIPOptions(**{**base, **{k: opts[k] for k in HIPOptions.__dataclass_fields__ if k in opts}})
+
+    def pack_metadata(self, metadata):
+        """Flatten launch metadata into a 6-tuple.
+
+        Order: num_warps, num_ctas, shared, then cluster_dims[0], [1], [2].
+        """
+        scalars = (metadata.num_warps, metadata.num_ctas, metadata.shared)
+        dims = metadata.cluster_dims
+        # Index explicitly so a cluster_dims shorter than three still raises.
+        return scalars + (dims[0], dims[1], dims[2])
+
+    def get_codegen_implementation(self):
+        # No codegen functions are registered here; callers get a new empty dict each time.
+        return {}
+
+    def load_dialects(self, ctx):  # Forward `ctx` to amd.load_dialects.
+        amd.load_dialects(ctx)
+
+    @staticmethod
+    def path_to_rocm_lld():
+        """Return the first ld.lld that exists as a file (a Path); raise Exception if none does.
+
+        Candidates, in order: $TRITON_HIP_LLD_PATH (when set), the copy next to
+        this file (used for pytorch wheels), /opt/rocm/llvm/bin, /usr/bin.
+        """
+        candidates = [
+            Path(__file__).parent / "llvm/bin/ld.lld",
+            Path("/opt/rocm/llvm/bin/ld.lld"),
+            Path("/usr/bin/ld.lld"),
+        ]
+        lld_env_path = os.getenv("TRITON_HIP_LLD_PATH")
+        if lld_env_path is not None:
+            candidates.insert(0, Path(lld_env_path))
+        for lld in candidates:
+            if lld.is_file():
+                return lld
+        raise Exception("ROCm linker /opt/rocm/llvm/bin/ld.lld not found")
+
+    @staticmethod
+    def make_ttir(mod, metadata, options):  # Run the TTIR pass pipeline over `mod` and return that same object.
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+        passes.common.add_inliner(pm)
+        passes.ttir.add_rewrite_tensor_pointer(pm)
+        passes.ttir.add_combine(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.ttir.add_reorder_broadcast(pm)
+        passes.common.add_cse(pm)
+        passes.common.add_licm(pm)
+        passes.common.add_symbol_dce(pm)
+        pm.run(mod)  # `metadata` and `options` are not read in this stage
+        return mod
+
+    @staticmethod
+    def make_ttgir(mod, metadata, options):  # Convert `mod` to TritonGPU IR, run the ttgpuir passes, return `mod`.
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+        passes.ttir.add_convert_to_ttgpuir(pm, f"hip:{options.arch}", options.num_warps, options.warp_size,
+                                           options.num_ctas)
+        pm.run(mod)
+        pm = ir.pass_manager(mod.context)  # fresh pass manager for the passes that follow the conversion
+        pm.enable_debug()
+        passes.ttgpuir.add_coalesce(pm)
+        passes.ttgpuir.add_remove_layout_conversions(pm)
+        passes.ttgpuir.add_optimize_thread_locality(pm)
+        amd.passes.ttgpuir.add_accelerate_matmul(pm, options.arch, options.matrix_instr_nonkdim, options.kpack)
+        passes.ttgpuir.add_remove_layout_conversions(pm)
+        amd.passes.ttgpuir.add_optimize_epilogue(pm)
+        passes.ttgpuir.add_optimize_dot_operands(pm, True)
+        if options.num_stages == 0 and amd.has_matrix_core_feature(options.arch):  # stream pipeline only for num_stages == 0
+            amd.passes.ttgpuir.add_stream_pipeline(pm)
+            passes.common.add_canonicalizer(pm)
+        passes.ttgpuir.add_optimize_dot_operands(pm, True)
+        passes.ttgpuir.add_remove_layout_conversions(pm)
+        passes.ttgpuir.add_reduce_data_duplication(pm)
+        if options.num_stages != 0:  # instruction reordering only for a non-zero num_stages
+            amd.passes.ttgpuir.add_reorder_instructions(pm)
+        passes.common.add_cse(pm)
+        passes.common.add_symbol_dce(pm)
+        pm.run(mod)  # `metadata` is not read in this stage
+        return mod
+
+    @staticmethod
+    def make_llir(src, metadata, options):  # Lower `src` to LLVM IR text (str); also sets metadata["shared"].
+        mod = src
+        # TritonGPU -> LLVM-IR (MLIR)
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+        amd.passes.ttgpuir.add_decompose_unsupported_conversions(pm, options.arch)
+        passes.convert.add_scf_to_cf(pm)
+        passes.convert.add_index_to_llvmir(pm)
+
+        passes.ttgpuir.add_allocate_shared_memory(pm)
+        ## __HIP_FTZ is used to control the denorm flushing behavior of exp2 op as follows:
+        ## 1. If __HIP_FTZ = 1, exp2 flushes denorms in input and output regardless
+        ##    of the value of kernel arg `allow_flush_denorm`.
+        ## 2. If __HIP_FTZ = 0, whether exp2 flushes denorms in input and output
+        ##    depends on the value of kernel arg `allow_flush_denorm`.
+        ## 3. __HIP_FTZ defaults to 1 and is not exposed as a kernel argument.
+        ##    For now it is a control for developers only.
+        __HIP_FTZ = True
+        amd.passes.ttgpuir.add_to_llvmir(pm, options.arch, __HIP_FTZ)
+        passes.common.add_canonicalizer(pm)
+        passes.common.add_cse(pm)
+
+        passes.convert.add_cf_to_llvmir(pm)
+        passes.convert.add_arith_to_llvmir(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.common.add_cse(pm)
+        passes.common.add_symbol_dce(pm)
+        if os.environ.get("TRITON_DISABLE_LINE_INFO", "0") == "0":
+            passes.llvmir.add_di_scope(pm)
+        # This pass (`add_builtin_func_to_llvmir`) serves as a temporary workaround to address the issue of excessive basic block
+        # count caused by predicated loads/stores. In certain kernels, the addition of these blocks can cause the MLIR
+        # canonicalizer to never finish when attempting to merge blocks. The permanent solution under consideration
+        # involves using MUBUF instructions that have built-in out-of-bounds checks, which would eliminate the need
+        # for conditional branching around memory accesses.
+        amd.passes.ttgpuir.add_builtin_func_to_llvmir(pm)
+        pm.run(mod)
+
+        # LLVM-IR (MLIR) -> LLVM-IR (LLVM)
+        llvm.init_targets()
+        context = llvm.context()
+        llvm_mod = llvm.to_module(mod, context)
+
+        # Set various control constants on the LLVM module so that device
+        # libraries can resolve references to them.
+        amd.set_isa_version(llvm_mod, options.arch)
+        amd.set_abi_version(llvm_mod, 400)
+        amd.set_bool_control_constant(llvm_mod, "__oclc_finite_only_opt", False)
+        amd.set_bool_control_constant(llvm_mod, "__oclc_correctly_rounded_sqrt32", True)
+        amd.set_bool_control_constant(llvm_mod, "__oclc_unsafe_math_opt", False)
+        amd.set_bool_control_constant(llvm_mod, "__oclc_wavefrontsize64", options.warp_size == 64)
+
+        # Set kernel attributes first given this may affect later optimizations.
+        fns = [fn for fn in llvm_mod.get_functions() if not fn.is_declaration()]
+        # The public kernel should be kernel 0.
+        fns[0].set_calling_conv(amd.CALLING_CONV_AMDGPU_KERNEL)
+        fns[0].add_fn_attr("amdgpu-flat-work-group-size", f"1,{options.num_warps*options.warp_size}")
+        fns[0].add_fn_attr("amdgpu-waves-per-eu", f"{options.waves_per_eu}")
+        denormal_mode = "preserve-sign" if options.allow_flush_denorm else "ieee"
+        fns[0].add_fn_attr("denormal-fp-math-f32", denormal_mode)
+
+        if options.extern_libs:
+            paths = [path for (name, path) in options.extern_libs if amd.need_extern_lib(llvm_mod, name)]
+            llvm.link_extern_libs(llvm_mod, paths)
+
+        llvm.optimize_module(llvm_mod, llvm.OPTIMIZE_O3, amd.TARGET_TRIPLE)
+
+        # Record the module's "triton_gpu.shared" integer attribute as metadata["shared"].
+        metadata["shared"] = src.get_int_attr("triton_gpu.shared")
+
+        amd.cleanup_bitcode_metadata(llvm_mod)
+        return str(llvm_mod)
+
+    @staticmethod
+    def make_amdgcn(src, metadata, options):  # Translate LLVM IR text to AMDGCN assembly; sets metadata["name"].
+        # Find kernel names (there should only be one)
+        # We get the name at the last possible step to accommodate `triton.compile`
+        # on user-provided LLVM
+        names = re.findall(r"define amdgpu_kernel void @([a-zA-Z_][a-zA-Z0-9_]*)", src)
+        assert len(names) == 1
+        metadata["name"] = names[0]
+        # LLVM IR -> AMDGCN assembly
+        amdgcn = llvm.translate_to_asm(src, amd.TARGET_TRIPLE, options.arch, '', [], options.enable_fp_fusion, False)
+        if os.environ.get("AMDGCN_ENABLE_DUMP", "0") == "1":
+            print("// -----// AMDGCN Dump //----- //")
+            print(amdgcn)
+        return amdgcn
+
+    @staticmethod
+    def make_hsaco(src, metadata, options):
+        # Assemble the AMDGCN text, then link it with ld.lld into a shared object returned as bytes.
+        obj_bytes = amd.assemble_amdgcn(src, options.arch, '')
+        linker = HIPBackend.path_to_rocm_lld()
+        with tempfile.NamedTemporaryFile() as tmp_out:
+            with tempfile.NamedTemporaryFile() as tmp_in:
+                # The linker works on file names, so the object goes through a named temporary file.
+                Path(tmp_in.name).write_bytes(obj_bytes)
+                link_cmd = [linker, '-flavor', 'gnu', '-shared', tmp_in.name, '-o', tmp_out.name]
+                subprocess.check_call(link_cmd)
+            # The input file is gone at this point; read the linked output before it is removed too.
+            return Path(tmp_out.name).read_bytes()
+
+    def add_stages(self, stages, options):  # Add one callable per stage key to `stages`, each closing over `options`.
+        stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options)
+        stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options)
+        stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options)
+        stages["amdgcn"] = lambda src, metadata: self.make_amdgcn(src, metadata, options)
+        stages["hsaco"] = lambda src, metadata: self.make_hsaco(src, metadata, options)
+
+    @functools.lru_cache()  # NOTE(review): caching a method keys on `self` and keeps instances alive — confirm intended
+    def hash(self):  # Backend key: the `ld.lld --version` output followed by the target.
+        version = subprocess.check_output([HIPBackend.path_to_rocm_lld(), "--version"], encoding='utf-8')
+        return f'{version}-{self.target}'
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/amd/driver.c b/.venv/lib/python3.11/site-packages/triton/backends/amd/driver.c
new file mode 100644
index 0000000000000000000000000000000000000000..233613a55411162b0ff2b74e4b2157165c35e59f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/amd/driver.c
@@ -0,0 +1,211 @@
+#define __HIP_PLATFORM_AMD__
+// clang-format off
+// hip_deprecated.h needs definitions from hip_runtime.h.
+#include <hip/hip_runtime.h>
+#include <hip/hip_deprecated.h>
+// clang-format on
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <dlfcn.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// The list of paths to search for the HIP runtime library. The caller Python
+// code should substitute the search path placeholder.
+static const char *hipLibSearchPaths[] = {"/*py_libhip_search_path*/"};
+
+// The list of HIP dynamic library symbols and their signature we are interested
+// in this file.
+// |FOR_EACH_ERR_FN| is a macro to process APIs that return hipError_t;
+// |FOR_EACH_STR_FN| is a macro to process APIs that return const char *.
+//
+// HIP 6.0 introduced an updated hipGetDeviceProperties API under a new symbol,
+// hipGetDevicePropertiesR0600. However, the associated hipDeviceProp_t was
+// directly updated with breaking changes to match hipGetDevicePropertiesR0600
+// in the header file. We include the header file from HIP 6.0. So here if we
+// use hipGetDeviceProperties together with hipDeviceProp_t we will use the
+// old API with a new struct definition and mess up the interpretation.
+//
+// This is a known issue: https://github.com/ROCm/ROCm/issues/2728.
+//
+// For now explicitly defer to the old hipDeviceProp_t struct. This should work
+// for both 5.x and 6.x. In the long term we need to switch to use
+// hipGetProcAddress once available:
+// https://github.com/ROCm/clr/commit/0479cdb3dd30ef58718cad44e424bd793c394cc0
+#define HIP_SYMBOL_LIST(FOR_EACH_ERR_FN, FOR_EACH_STR_FN)                      \
+  FOR_EACH_STR_FN(hipGetErrorString, hipError_t hipError)                      \
+  FOR_EACH_ERR_FN(hipGetDeviceProperties, hipDeviceProp_tR0000 *prop,          \
+                  int deviceId)                                                \
+  FOR_EACH_ERR_FN(hipModuleLoadDataEx, hipModule_t *module, const void *image, \
+                  unsigned int numOptions, hipJitOption *options,              \
+                  void **optionValues)                                         \
+  FOR_EACH_ERR_FN(hipModuleGetFunction, hipFunction_t *function,               \
+                  hipModule_t module, const char *kname)                       \
+  FOR_EACH_ERR_FN(hipFuncGetAttribute, int *, hipFunction_attribute attr,      \
+                  hipFunction_t function)
+
+// The HIP symbol table for holding resolved dynamic library symbols.
+struct HIPSymbolTable {
+#define DEFINE_EACH_ERR_FIELD(hipSymbolName, ...)                              \
+  hipError_t (*hipSymbolName)(__VA_ARGS__);
+#define DEFINE_EACH_STR_FIELD(hipSymbolName, ...)                              \
+  const char *(*hipSymbolName)(__VA_ARGS__);
+
+  HIP_SYMBOL_LIST(DEFINE_EACH_ERR_FIELD, DEFINE_EACH_STR_FIELD)
+};
+
+static struct HIPSymbolTable hipSymbolTable;
+
+bool initSymbolTable() {
+  // Fill hipSymbolTable; on failure set a Python RuntimeError and return false.
+  // Prefer a HIP runtime already loaded in this process. dlopen(3) requires
+  // RTLD_LAZY or RTLD_NOW in flags, so RTLD_NOLOAD alone would be rejected.
+  void *lib = dlopen("libamdhip64.so", RTLD_LAZY | RTLD_NOLOAD);
+
+  // Otherwise, go through the list of search paths and keep the first HIP
+  // runtime library that opens.
+  if (!lib) {
+    int n = sizeof(hipLibSearchPaths) / sizeof(hipLibSearchPaths[0]);
+    for (int i = 0; i < n; ++i) {
+      void *handle = dlopen(hipLibSearchPaths[i], RTLD_LAZY | RTLD_LOCAL);
+      if (handle) {
+        lib = handle;
+        // printf("[triton] chosen %s\n", hipLibSearchPaths[i]);
+        break; // stop at the first hit; later handles would be leaked
+      }
+    }
+  }
+  if (!lib) {
+    PyErr_SetString(PyExc_RuntimeError, "cannot open libamdhip64.so");
+    return false;
+  }
+
+  // Resolve all symbols we are interested in.
+  dlerror(); // Clear existing errors
+  const char *error = NULL;
+#define QUERY_EACH_FN(hipSymbolName, ...)                                      \
+  *(void **)&hipSymbolTable.hipSymbolName = dlsym(lib, #hipSymbolName);        \
+  error = dlerror();                                                           \
+  if (error) {                                                                 \
+    PyErr_SetString(PyExc_RuntimeError,                                        \
+                    "cannot query " #hipSymbolName " from libamdhip64.so");    \
+    dlclose(lib);                                                              \
+    return false;                                                              \
+  }
+
+  HIP_SYMBOL_LIST(QUERY_EACH_FN, QUERY_EACH_FN)
+
+  return true;
+}
+
+static inline void gpuAssert(hipError_t code, const char *file, int line) {
+  // Turn a failing HIP status into a pending Python RuntimeError; success is
+  // a no-op. `file` and `line` are accepted but not used in the message.
+  if (code == HIP_SUCCESS)
+    return;
+
+  const char *prefix = "Triton Error [HIP]: ";
+  const char *str = hipSymbolTable.hipGetErrorString(code);
+  char err[1024] = {0};
+  snprintf(err, sizeof(err), "%s Code: %d, Messsage: %s", prefix, code, str);
+  // Hold the GIL while the exception is set, then restore the previous state.
+  PyGILState_STATE gil_state = PyGILState_Ensure();
+  PyErr_SetString(PyExc_RuntimeError, err);
+  PyGILState_Release(gil_state);
+}
+
+// Evaluate a HIP call; if a Python error is pending afterwards, return NULL.
+#define HIP_CHECK(ans)                                                         \
+  {                                                                            \
+    gpuAssert((ans), __FILE__, __LINE__);                                      \
+    if (PyErr_Occurred())                                                      \
+      return NULL;                                                             \
+  }
+
+static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
+  int device_id;
+  if (!PyArg_ParseTuple(args, "i", &device_id))
+    return NULL;
+
+  hipDeviceProp_tR0000 props;
+  HIP_CHECK(hipSymbolTable.hipGetDeviceProperties(&props, device_id));
+
+  // Return the properties as a dict. sharedMemPerBlock is a size_t; cast it to
+  // int so the vararg matches the "i" format unit Py_BuildValue reads it with.
+  return Py_BuildValue(
+      "{s:i, s:i, s:i, s:i, s:i, s:i, s:s, s:i}", "max_shared_mem",
+      (int)props.sharedMemPerBlock, "max_num_regs", props.regsPerBlock,
+      "multiprocessor_count", props.multiProcessorCount, "sm_clock_rate",
+      props.clockRate, "mem_clock_rate", props.memoryClockRate, "mem_bus_width",
+      props.memoryBusWidth, "arch", props.gcnArchName, "warpSize", props.warpSize);
+}
+
+static PyObject *loadBinary(PyObject *self, PyObject *args) {
+  const char *name;
+  const char *data;
+  Py_ssize_t data_size; // parsed but not read below
+  int shared;           // parsed but not read below
+  int device;           // parsed but not read below
+  if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared,
+                        &device)) {
+    return NULL;
+  }
+
+  // JIT options: error and info log buffers (8192 bytes each), verbose logging
+  hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes,
+                        hipJitOptionErrorLogBuffer,
+                        hipJitOptionInfoLogBufferSizeBytes,
+                        hipJitOptionInfoLogBuffer, hipJitOptionLogVerbose};
+  const unsigned int errbufsize = 8192;
+  const unsigned int logbufsize = 8192;
+  char _err[errbufsize];
+  char _log[logbufsize];
+  void *optval[] = {(void *)(uintptr_t)errbufsize, (void *)_err,
+                    (void *)(uintptr_t)logbufsize, (void *)_log, (void *)1};
+
+  // load the module from `data`, then look up the kernel called `name`
+  hipModule_t mod;
+  hipFunction_t fun;
+  HIP_CHECK(hipSymbolTable.hipModuleLoadDataEx(&mod, data, 5, opt, optval))
+  HIP_CHECK(hipSymbolTable.hipModuleGetFunction(&fun, mod, name));
+
+  // query register count and local size; NOTE(review): both hipError_t results are ignored — confirm intended
+  int n_regs = 0;
+  int n_spills = 0;
+  hipSymbolTable.hipFuncGetAttribute(&n_regs, HIP_FUNC_ATTRIBUTE_NUM_REGS, fun);
+  hipSymbolTable.hipFuncGetAttribute(&n_spills,
+                                     HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun);
+  n_spills /= 4; // local size in bytes divided by 4
+  if (PyErr_Occurred()) {
+    return NULL;
+  }
+  return Py_BuildValue("(KKii)", (uint64_t)mod, (uint64_t)fun, n_regs,
+                       n_spills); // tuple: (module handle, function handle, n_regs, n_spills)
+}
+
+static PyMethodDef ModuleMethods[] = {
+    {"load_binary", loadBinary, METH_VARARGS,
+     "Load provided hsaco into HIP driver"},
+    {"get_device_properties", getDeviceProperties, METH_VARARGS,
+     "Get the properties for a given device"},
+    {NULL, NULL, 0, NULL} // sentinel
+};
+
+static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "hip_utils",
+                                       NULL, // documentation
+                                       -1,   // size
+                                       ModuleMethods};
+
+PyMODINIT_FUNC PyInit_hip_utils(void) {
+  if (!initSymbolTable()) { // resolves the HIP symbols; sets a Python error on failure
+    return NULL;
+  }
+
+  PyObject *m = PyModule_Create(&ModuleDef);
+  if (m == NULL) {
+    return NULL;
+  }
+  PyModule_AddFunctions(m, ModuleMethods); // NOTE(review): ModuleDef already lists ModuleMethods — confirm this is needed
+
+  return m;
+}
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/amd/driver.py b/.venv/lib/python3.11/site-packages/triton/backends/amd/driver.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1ff6e1d65f3fbbdf824b93ad17f80164bc96626
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/amd/driver.py
@@ -0,0 +1,497 @@
+import functools
+import os
+import hashlib
+import subprocess
+import tempfile
+from pathlib import Path
+from triton.runtime.build import _build
+from triton.runtime.cache import get_cache_manager
+from triton.backends.compiler import GPUTarget
+from triton.backends.driver import GPUDriver
+
+dirname = os.path.dirname(os.path.realpath(__file__))
+include_dir = [os.path.join(dirname, "include")]
+
+
+def _find_already_mmapped_dylib_on_linux(lib_name):
+    """Return the path of a loaded shared object whose file name contains `lib_name`, or None."""
+    import platform
+    if platform.system() != 'Linux':
+        return None
+
+    # Use dl_iterate_phdr to walk through the list of shared libraries at runtime.
+    # See https://www.man7.org/linux/man-pages/man3/dl_iterate_phdr.3.html for details.
+    import ctypes
+    from ctypes import c_char, c_int, c_size_t, c_void_p, c_char_p, POINTER
+
+    class DlPhdrInfo(ctypes.Structure):
+        _fields_ = [
+            ('dlpi_addr', c_void_p),
+            ('dlpi_name', c_char_p),
+            # We don't care about the remaining fields.
+        ]
+
+    # callback_t must use POINTER(c_char) to avoid copying.
+    callback_t = ctypes.CFUNCTYPE(c_int, POINTER(DlPhdrInfo), POINTER(c_size_t), POINTER(c_char))
+
+    # Load libc and get the dl_iterate_phdr symbol.
+    try:
+        dl_iterate_phdr = ctypes.CDLL('libc.so.6').dl_iterate_phdr
+    except (OSError, AttributeError):  # libc.so.6 cannot be loaded, or it lacks the symbol
+        return None
+    # argtypes must use c_char_p to accept create_string_buffer.
+    dl_iterate_phdr.argtypes = [callback_t, c_char_p]
+    dl_iterate_phdr.restype = c_int
+
+    max_path_length = 4096
+    path = ctypes.create_string_buffer(max_path_length + 1)
+
+    # Define callback to get the loaded dylib path.
+    def callback(info, size, data):
+        dlpi_name = info.contents.dlpi_name
+        p = Path(os.fsdecode(dlpi_name))
+        if lib_name in p.name:
+            # Found the dylib; copy its path into `data` and return 1 to stop the walk.
+            ctypes.memmove(data, dlpi_name, min(max_path_length, len(dlpi_name)))
+            return 1
+        return 0
+
+    if dl_iterate_phdr(callback_t(callback), path):
+        return os.fsdecode(ctypes.string_at(path))
+    return None
+
+
+@functools.lru_cache()
+def _get_path_to_hip_runtime_dylib():
+    """Locate libamdhip64.so and return its path; raise RuntimeError when no candidate exists."""
+    lib_name = "libamdhip64.so"
+    # If we are told explicitly what HIP runtime dynamic library to use, obey that.
+    env_libhip_path = os.getenv("TRITON_LIBHIP_PATH")
+    if env_libhip_path:
+        if env_libhip_path.endswith(lib_name) and os.path.exists(env_libhip_path):
+            return env_libhip_path
+        raise RuntimeError(f"TRITON_LIBHIP_PATH '{env_libhip_path}' does not point to a valid {lib_name}")
+
+    # If the shared object is already mmapped to address space, use it.
+    mmapped_path = _find_already_mmapped_dylib_on_linux(lib_name)
+    if mmapped_path:
+        if os.path.exists(mmapped_path):
+            return mmapped_path
+        raise RuntimeError(f"memory mapped '{mmapped_path}' in process does not point to a valid {lib_name}")
+
+    paths = []
+    import site
+    # First search the HIP runtime dynamic library packaged with PyTorch. It's very likely
+    # that we run Triton together with PyTorch. This makes sure we use the same dynamic
+    # library to avoid version mismatch.
+    site_packages = site.getsitepackages()
+    user_site = site.getusersitepackages()
+    if site.ENABLE_USER_SITE:  # ENABLE_USER_SITE is initialized in getusersitepackages()
+        site_packages = [user_site] + site_packages
+    for path in site_packages:
+        path = os.path.join(path, "torch", "lib", lib_name)
+        if os.path.exists(path):
+            return path
+        paths.append(path)
+
+    # Then try to see if developer provides a HIP runtime dynamic library using LD_LIBRARY_PATH.
+    env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
+    if env_ld_library_path:
+        for d in env_ld_library_path.split(":"):
+            f = os.path.join(d, lib_name)
+            if os.path.exists(f):
+                return f
+            paths.append(f)
+
+    # Afterwards try to search the loader dynamic library resolution paths.
+    try:
+        libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
+    except (OSError, subprocess.CalledProcessError):
+        libs = ""  # ldconfig missing or failing must not prevent the last-resort check below
+    # each line looks like: libamdhip64.so (libc6,x86-64) => /opt/rocm-6.0.2/lib/libamdhip64.so
+    locs = [line.split()[-1] for line in libs.splitlines() if line.strip().endswith(lib_name)]
+    for loc in locs:
+        if os.path.exists(loc):
+            return loc
+        paths.append(loc)
+
+    # As a last resort, guess if we have it in some common installation path.
+    common_install_path = os.path.join('/opt/rocm/lib/', lib_name)
+    if os.path.exists(common_install_path):
+        return common_install_path
+    paths.append(common_install_path)
+
+    raise RuntimeError(f"cannot locate {lib_name} after attempted paths {paths}")
+
+
+def compile_module_from_src(src, name):
+    """Build C source `src` into an extension named `name` (cached by source hash) and import it."""
+    so_name = f"{name}.so"
+    cache = get_cache_manager(hashlib.sha256(src.encode("utf-8")).hexdigest())
+    cache_path = cache.get_file(so_name)
+    if cache_path is None:
+        # Cache miss: compile inside a throwaway directory and store the resulting binary.
+        with tempfile.TemporaryDirectory() as tmpdir:
+            src_path = os.path.join(tmpdir, "main.c")
+            Path(src_path).write_text(src)
+            so = _build(name, src_path, tmpdir, [], include_dir, [])
+            cache_path = cache.put(Path(so).read_bytes(), so_name, binary=True)
+    import importlib.util
+    spec = importlib.util.spec_from_file_location(name, cache_path)
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    return mod
+
+
+class HIPUtils(object):
+    """Singleton that builds driver.c as `hip_utils` and exposes its load_binary / get_device_properties."""
+
+    def __new__(cls):
+        if hasattr(cls, "instance"):
+            return cls.instance
+        cls.instance = super().__new__(cls)
+        return cls.instance
+
+    def __init__(self):
+        # Plain one-shot string replacement instead of templates or format strings: the C source's
+        # curly brackets need no escaping, and the placeholder is substituted exactly once.
+        libhip_path = _get_path_to_hip_runtime_dylib()
+        src = (Path(dirname) / "driver.c").read_text().replace('/*py_libhip_search_path*/', libhip_path, 1)
+        mod = compile_module_from_src(src, "hip_utils")
+        self.load_binary = mod.load_binary
+        self.get_device_properties = mod.get_device_properties
+
+
+# -------------------- Launcher ----------------------------
+def ty_to_cpp(ty):
+    """Return the C type name for a Triton type string.
+
+    Raises KeyError for a non-pointer type that is not in the table.
+    """
+    # Anything starting with '*' is a pointer and maps to a device pointer.
+    if ty[0] == '*':
+        return "hipDeviceptr_t"
+    # 1-bit integers widen to 32 bits; fp16/bf16/fp32/f32 all map to float.
+    scalar_types = {
+        "i1": "int32_t", "u1": "uint32_t",
+        "i8": "int8_t", "u8": "uint8_t",
+        "i16": "int16_t", "u16": "uint16_t",
+        "i32": "int32_t", "u32": "uint32_t",
+        "i64": "int64_t", "u64": "uint64_t",
+        "fp16": "float", "bf16": "float",
+        "fp32": "float", "f32": "float",
+        "fp64": "double",
+    }
+    return scalar_types[ty]
+
+
+def make_launcher(constants, signature, ids, warp_size):
+    start_desc = len(signature)
+    #signature = generate_cu_signature(constants, signature, ids)
+    arg_decls = ', '.join(f"{ty_to_cpp(ty)} arg{i}" for i, ty in signature.items())
+
+    def _extracted_type(ty):
+        if ty[0] == '*':
+            return "PyObject*"
+        return {
+            'i1': 'int32_t',
+            'i8': 'int8_t',
+            'i16': 'int16_t',
+            'i32': 'int32_t',
+            'i64': 'int64_t',
+            'u1': 'uint32_t',
+            'u8': 'uint8_t',
+            'u16': 'uint16_t',
+            'u32': 'uint32_t',
+            'u64': 'uint64_t',
+            'fp16': 'float',
+            'bf16': 'float',
+            'fp32': 'float',
+            'f32': 'float',
+            'fp64': 'double',
+        }[ty]
+
+    def format_of(ty):
+        return {
+            "PyObject*": "O",
+            "float": "f",
+            "double": "d",
+            "long": "l",
+            "int8_t": "b",
+            "int16_t": "h",
+            "int32_t": "i",
+            "int64_t": "l",
+            "uint8_t": "B",
+            "uint16_t": "H",
+            "uint32_t": "I",
+            "uint64_t": "K",
+        }[ty]
+
+    args_format = ''.join([format_of(_extracted_type(ty)) for ty in signature.values()])
+    format = "iiiKKOOOO" + args_format
+    args_list = ', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''
+
+    libhip_path = _get_path_to_hip_runtime_dylib()
+
+    # generate glue code
+    params = [i for i in signature.keys() if i not in constants]
+    src = f"""
+#define __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#include <Python.h>
+#include <dlfcn.h>
+#include <stdbool.h>
+#include <dlfcn.h>
+
+// The list of paths to search for the HIP runtime library. The caller Python
+// code should substitute the search path placeholder.
+static const char *hipLibSearchPaths[] = {{"{libhip_path}"}};
+
+// The list of HIP dynamic library symbols and their signature we are interested
+// in this file.
+#define HIP_SYMBOL_LIST(FOR_EACH_ERR_FN, FOR_EACH_STR_FN)                     \\
+  FOR_EACH_STR_FN(hipGetErrorString, hipError_t hipError)                     \\
+  FOR_EACH_ERR_FN(hipModuleLaunchKernel, hipFunction_t f,                     \\
+                  unsigned int gridDimX, unsigned int gridDimY,               \\
+                  unsigned int gridDimZ, unsigned int blockDimX,              \\
+                  unsigned int blockDimY, unsigned int blockDimZ,             \\
+                  unsigned int sharedMemBytes, hipStream_t stream,            \\
+                  void **kernelParams, void **extra)                          \\
+  FOR_EACH_ERR_FN(hipPointerGetAttribute, void *data,                         \\
+                  hipPointer_attribute attribute, hipDeviceptr_t ptr)
+
+// The HIP symbol table for holding resolved dynamic library symbols.
+struct HIPSymbolTable {{
+#define DEFINE_EACH_ERR_FIELD(hipSymbolName, ...)                             \\
+  hipError_t (*hipSymbolName)(__VA_ARGS__);
+#define DEFINE_EACH_STR_FIELD(hipSymbolName, ...)                             \\
+  const char *(*hipSymbolName)(__VA_ARGS__);
+
+  HIP_SYMBOL_LIST(DEFINE_EACH_ERR_FIELD, DEFINE_EACH_STR_FIELD)
+}};
+
+static struct HIPSymbolTable hipSymbolTable;
+
+bool initSymbolTable() {{
+  // Use the HIP runtime library loaded into the existing process if it exits.
+  void *lib = dlopen("libamdhip64.so", RTLD_NOLOAD);
+  if (lib) {{
+    // printf("[triton] chosen loaded libamdhip64.so in the process\\n");
+  }}
+
+  // Otherwise, go through the list of search paths to dlopen the first HIP
+  // driver library.
+  if (!lib) {{
+    int n = sizeof(hipLibSearchPaths) / sizeof(hipLibSearchPaths[0]);
+    for (int i = 0; i < n; ++i) {{
+      void *handle = dlopen(hipLibSearchPaths[i], RTLD_LAZY | RTLD_LOCAL);
+      if (handle) {{
+        lib = handle;
+        // printf("[triton] chosen %s\\n", hipLibSearchPaths[i]);
+      }}
+    }}
+  }}
+  if (!lib) {{
+    PyErr_SetString(PyExc_RuntimeError, "cannot open libamdhip64.so");
+    return false;
+  }}
+
+  // Resolve all symbols we are interested in.
+  dlerror(); // Clear existing errors
+  const char *error = NULL;
+#define QUERY_EACH_FN(hipSymbolName, ...)                                     \\
+  *(void **)&hipSymbolTable.hipSymbolName = dlsym(lib, #hipSymbolName);       \\
+  error = dlerror();                                                          \\
+  if (error) {{                                                               \\
+    PyErr_SetString(PyExc_RuntimeError,                                       \\
+                    "cannot query " #hipSymbolName " from libamdhip64.so");   \\
+    dlclose(lib);                                                             \\
+    return false;                                                             \\
+  }}
+
+  HIP_SYMBOL_LIST(QUERY_EACH_FN, QUERY_EACH_FN)
+
+  return true;
+}}
+
+static inline void gpuAssert(hipError_t code, const char *file, int line)
+{{
+   if (code != HIP_SUCCESS)
+   {{
+      const char* prefix = "Triton Error [HIP]: ";
+       const char* str = hipSymbolTable.hipGetErrorString(code);
+      char err[1024] = {{0}};
+      snprintf(err, 1024, "%s Code: %d, Messsage: %s", prefix, code, str );
+      PyErr_SetString(PyExc_RuntimeError, err);
+   }}
+}}
+
+#define HIP_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }}
+
+static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, hipStream_t stream, hipFunction_t function{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
+  // printf("_launch hip kernel\\n");
+  void *params[] = {{ {', '.join(f"&arg{i}" for i in params)} }};
+  if (gridX*gridY*gridZ > 0) {{
+      HIP_CHECK(hipSymbolTable.hipModuleLaunchKernel(function, gridX, gridY, gridZ, {warp_size}*num_warps, 1, 1, shared_memory, stream, params, 0));
+    }}
+  }}
+
+typedef struct _DevicePtrInfo {{
+    hipDeviceptr_t dev_ptr;
+    bool valid;
+}} DevicePtrInfo;
+
+static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{
+  DevicePtrInfo ptr_info;
+  ptr_info.dev_ptr = 0;
+  ptr_info.valid = true;
+  if (PyLong_Check(obj)) {{
+    ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(obj);
+    return ptr_info;
+  }}
+  if (obj == Py_None) {{
+    // valid nullptr
+    return ptr_info;
+  }}
+  PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
+  if(ptr){{
+    PyObject *empty_tuple = PyTuple_New(0);
+    PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL);
+    Py_DECREF(empty_tuple);
+    Py_DECREF(ptr);
+    if (!PyLong_Check(ret)) {{
+      PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
+      ptr_info.valid = false;
+      return ptr_info;
+    }}
+    ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(ret);
+    if(!ptr_info.dev_ptr)
+      return ptr_info;
+    uint64_t dev_ptr;
+    hipError_t status = hipSymbolTable.hipPointerGetAttribute(&dev_ptr, HIP_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
+    if (status == hipErrorInvalidValue) {{
+        PyErr_Format(PyExc_ValueError,
+                     "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
+        ptr_info.valid = false;
+    }}
+    ptr_info.dev_ptr = (hipDeviceptr_t)dev_ptr;
+    Py_DECREF(ret);
+    return ptr_info;
+  }}
+  PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
+  return ptr_info;
+}}
+
+static PyObject* launch(PyObject* self, PyObject* args) {{
+   // printf("launch\\n");
+  int gridX, gridY, gridZ;
+  uint64_t _stream;
+  uint64_t _function;
+  PyObject *launch_enter_hook = NULL;
+  PyObject *launch_exit_hook = NULL;
+  PyObject *kernel_metadata = NULL;
+  PyObject *launch_metadata = NULL;
+  {' '.join([f"{_extracted_type(ty)} _arg{i}; " for i, ty in signature.items()])}
+  if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ, &_stream, &_function,
+                                           &kernel_metadata, &launch_metadata,
+                                           &launch_enter_hook, &launch_exit_hook {args_list})) {{
+    return NULL;
+  }}
+
+  // extract kernel metadata
+  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
+  if (!PyArg_ParseTuple(kernel_metadata, \"iiiiii\", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {{
+    return NULL;
+  }}
+  // extract launch metadata
+  if (launch_enter_hook != Py_None){{
+    PyObject* args = Py_BuildValue("(O)", launch_metadata);
+    PyObject* ret = PyObject_CallObject(launch_enter_hook, args);
+    Py_DECREF(args);
+    if (!ret)
+      return NULL;
+  }}
+
+
+  // raise exception asap
+  {"; ".join([f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;" if ty[0] == "*" else "" for i, ty in signature.items()])};
+  _launch(gridX, gridY, gridZ, num_warps, num_ctas, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (hipStream_t)_stream, (hipFunction_t)_function{', ' + ', '.join(f"ptr_info{i}.dev_ptr" if ty[0]=="*" else f"_arg{i}"for i, ty in signature.items()) if len(signature) > 0 else ''});
+
+  if(launch_exit_hook != Py_None){{
+    PyObject* args = Py_BuildValue("(O)", launch_metadata);
+    PyObject* ret = PyObject_CallObject(launch_exit_hook, args);
+    Py_DECREF(args);
+    if (!ret)
+      return NULL;
+  }}
+
+  if(PyErr_Occurred()) {{
+    return NULL;
+  }}
+  // return None
+  Py_INCREF(Py_None);
+  return Py_None;
+}}
+
+static PyMethodDef ModuleMethods[] = {{
+  {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}},
+  {{NULL, NULL, 0, NULL}} // sentinel
+}};
+
+static struct PyModuleDef ModuleDef = {{
+  PyModuleDef_HEAD_INIT,
+  \"__triton_launcher\",
+  NULL, //documentation
+  -1, //size
+  ModuleMethods
+}};
+
+PyMODINIT_FUNC PyInit___triton_launcher(void) {{
+  if (!initSymbolTable()) {{
+    return NULL;
+  }}
+  PyObject *m = PyModule_Create(&ModuleDef);
+  if(m == NULL) {{
+    return NULL;
+  }}
+  PyModule_AddFunctions(m, ModuleMethods);
+  return m;
+}}
+"""
+    return src
+
+
+class HIPLauncher(object):
+
+    def __init__(self, src, metadata):
+        ids = {"ids_of_const_exprs": src.fn.constexprs if hasattr(src, "fn") else tuple()}
+        constants = src.constants if hasattr(src, "constants") else dict()
+        cst_key = lambda i: src.fn.arg_names.index(i) if isinstance(i, str) else i
+        constants = {cst_key(key): value for key, value in constants.items()}
+        signature = {cst_key(key): value for key, value in src.signature.items()}
+        src = make_launcher(constants, signature, ids, metadata.warp_size)
+        mod = compile_module_from_src(src, "__triton_launcher")
+        self.launch = mod.launch
+
+    def __call__(self, *args, **kwargs):
+        self.launch(*args, **kwargs)
+
+
+class HIPDriver(GPUDriver):
+
+    def __init__(self):
+        super().__init__()
+        self.utils = HIPUtils()
+        self.launcher_cls = HIPLauncher
+
+    @staticmethod
+    def is_active():
+        import torch
+        return torch.version.hip is not None
+
+    def get_current_target(self):
+        device = self.get_current_device()
+        device_properties = self.utils.get_device_properties(device)
+        arch = device_properties['arch']
+        warp_size = device_properties['warpSize']
+        return GPUTarget("hip", arch.split(':')[0], warp_size)
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/hip_runtime.h b/.venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/hip_runtime.h
new file mode 100644
index 0000000000000000000000000000000000000000..20dd9703df4e1027ab30bbe12b320aa3178ea3eb
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/hip_runtime.h
@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+//! HIP = Heterogeneous-compute Interface for Portability
+//!
+//! Define an extremely thin runtime layer that allows source code to be compiled unmodified
+//! through either AMD CLANG or NVCC.   Key features tend to be in the spirit
+//! and terminology of CUDA, but with a portable path to other accelerators as well:
+//
+//! Both paths support rich C++ features including classes, templates, lambdas, etc.
+//! Runtime API is C
+//! Memory management is based on pure pointers and resembles malloc/free/copy.
+//
+//! hip_runtime.h     : includes everything in hip_api.h, plus math builtins and kernel launch
+//!                     macros.
+//! hip_runtime_api.h : Defines HIP API.  This is a C header file and does not use any C++ features.
+
+#ifndef HIP_INCLUDE_HIP_HIP_RUNTIME_H
+#define HIP_INCLUDE_HIP_HIP_RUNTIME_H
+
+#if __HIP_DEVICE_COMPILE__ && !__GFX7__ && !__GFX8__ && !__GFX9__ && __AMDGCN_WAVEFRONT_SIZE == 64
+#error HIP is not supported on the specified GPU ARCH with wavefront size 64
+#endif
+
+#if !defined(__HIPCC_RTC__)
+// Some standard header files. These are included by hc.hpp, so make them available on both
+// paths to provide a consistent include environment and avoid "missing symbol" errors that only
+// appear on the NVCC path:
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#if __cplusplus > 199711L
+#include <thread>
+#endif
+#endif // !defined(__HIPCC_RTC__)
+
+#include <hip/hip_version.h>
+#include <hip/hip_common.h>
+
+#if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
+#include <hip/amd_detail/amd_hip_runtime.h>
+#elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
+#include <hip/nvidia_detail/nvidia_hip_runtime.h>
+#else
+#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
+#endif
+
+#if !defined(__HIPCC_RTC__)
+#include <hip/hip_runtime_api.h>
+#include <hip/library_types.h>
+#endif // !defined(__HIPCC_RTC__)
+#include <hip/hip_vector_types.h>
+
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/texture_types.h b/.venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/texture_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..3473c8188637995720d20e614e85d06d8bdf7d59
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/texture_types.h
@@ -0,0 +1,194 @@
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_TEXTURE_TYPES_H
+#define HIP_INCLUDE_HIP_TEXTURE_TYPES_H
+
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#pragma clang diagnostic ignored "-Wc++98-compat"
+#endif
+
+#if !defined(__HIPCC_RTC__)
+#include <hip/hip_common.h>
+#endif
+
+#if !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
+#include "texture_types.h"
+#elif defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
+/*******************************************************************************
+ *                                                                              *
+ *                                                                              *
+ *                                                                              *
+ *******************************************************************************/
+#if !defined(__HIPCC_RTC__)
+#include <limits.h>
+#include <hip/channel_descriptor.h>
+#include <hip/driver_types.h>
+#endif // !defined(__HIPCC_RTC__)
+
+#define hipTextureType1D 0x01
+#define hipTextureType2D 0x02
+#define hipTextureType3D 0x03
+#define hipTextureTypeCubemap 0x0C
+#define hipTextureType1DLayered 0xF1
+#define hipTextureType2DLayered 0xF2
+#define hipTextureTypeCubemapLayered 0xFC
+
+/**
+ * Should be same as HSA_IMAGE_OBJECT_SIZE_DWORD/HSA_SAMPLER_OBJECT_SIZE_DWORD
+ */
+#define HIP_IMAGE_OBJECT_SIZE_DWORD 12
+#define HIP_SAMPLER_OBJECT_SIZE_DWORD 8
+#define HIP_SAMPLER_OBJECT_OFFSET_DWORD HIP_IMAGE_OBJECT_SIZE_DWORD
+#define HIP_TEXTURE_OBJECT_SIZE_DWORD (HIP_IMAGE_OBJECT_SIZE_DWORD + HIP_SAMPLER_OBJECT_SIZE_DWORD)
+
+/**
+ * An opaque value that represents a hip texture object
+ */
+struct __hip_texture;
+typedef struct __hip_texture* hipTextureObject_t;
+
+/**
+ * hip texture address modes
+ */
+enum hipTextureAddressMode {
+    hipAddressModeWrap = 0,    // enumerator values 0..3 are assigned explicitly
+    hipAddressModeClamp = 1,   // default aMode of the defaulted texture<> constructor in this header
+    hipAddressModeMirror = 2,
+    hipAddressModeBorder = 3
+};
+
+/**
+ * hip texture filter modes (hipFilterModePoint is the default fMode of the defaulted texture<> constructor)
+ */
+enum hipTextureFilterMode { hipFilterModePoint = 0, hipFilterModeLinear = 1 };
+
+/**
+ * hip texture read modes (hipReadModeElementType is the default `mode` template argument of texture<>)
+ */
+enum hipTextureReadMode { hipReadModeElementType = 0, hipReadModeNormalizedFloat = 1 };
+
+/**
+ * hip texture reference
+ */
+typedef struct textureReference {
+    int normalized;  // texture<> constructors copy their `norm` argument here
+    enum hipTextureReadMode readMode;// used only for driver APIs
+    enum hipTextureFilterMode filterMode;
+    enum hipTextureAddressMode addressMode[3];  // Texture address mode for up to 3 dimensions
+    struct hipChannelFormatDesc channelDesc;
+    int sRGB;                    // Perform sRGB->linear conversion during texture read
+    unsigned int maxAnisotropy;  // Limit to the anisotropy ratio
+    enum hipTextureFilterMode mipmapFilterMode;  // not assigned by the texture<> constructors in this header
+    float mipmapLevelBias;
+    float minMipmapLevelClamp;
+    float maxMipmapLevelClamp;
+
+    hipTextureObject_t textureObject;  // texture<> constructors set this to nullptr
+    int numChannels;                   // not assigned by the texture<> constructors in this header
+    enum hipArray_Format format;       // not assigned by the texture<> constructors in this header
+}textureReference;
+
+/**
+ * hip texture descriptor (plain struct of modes and numeric settings; holds no texture object or channel descriptor)
+ */
+typedef struct hipTextureDesc {
+    enum hipTextureAddressMode addressMode[3];  // Texture address mode for up to 3 dimensions
+    enum hipTextureFilterMode filterMode;
+    enum hipTextureReadMode readMode;
+    int sRGB;  // Perform sRGB->linear conversion during texture read
+    float borderColor[4];
+    int normalizedCoords;  // NOTE(review): presumably the counterpart of textureReference::normalized - confirm
+    unsigned int maxAnisotropy;
+    enum hipTextureFilterMode mipmapFilterMode;
+    float mipmapLevelBias;
+    float minMipmapLevelClamp;
+    float maxMipmapLevelClamp;
+}hipTextureDesc;
+
+#if __cplusplus
+
+/*******************************************************************************
+ *                                                                              *
+ *                                                                              *
+ *                                                                              *
+ *******************************************************************************/
+#if __HIP__
+#define __HIP_TEXTURE_ATTRIB __attribute__((device_builtin_texture_type))
+#else
+#define __HIP_TEXTURE_ATTRIB
+#endif
+
+typedef textureReference* hipTexRef;
+
+// Typed texture reference. Both constructors populate the inherited
+// textureReference fields; mipmapFilterMode, numChannels and format are left
+// unassigned by either of them.
+template <class T, int texType = hipTextureType1D,
+          enum hipTextureReadMode mode = hipReadModeElementType>
+struct __HIP_TEXTURE_ATTRIB texture : public textureReference {
+    // Derives the channel descriptor from T via hipCreateChannelDesc<T>().
+    texture(int norm = 0, enum hipTextureFilterMode fMode = hipFilterModePoint,
+            enum hipTextureAddressMode aMode = hipAddressModeClamp) {
+        normalized = norm;
+        readMode = mode;
+        filterMode = fMode;
+        // One address mode is shared by all three dimensions.
+        addressMode[0] = addressMode[1] = addressMode[2] = aMode;
+        channelDesc = hipCreateChannelDesc<T>();
+        sRGB = 0;
+        textureObject = nullptr;
+        maxAnisotropy = 0;
+        // Mipmap bias and clamp bounds all start at zero.
+        mipmapLevelBias = minMipmapLevelClamp = maxMipmapLevelClamp = 0;
+    }
+
+    // Same initialisation, but with a caller-supplied channel descriptor.
+    texture(int norm, enum hipTextureFilterMode fMode, enum hipTextureAddressMode aMode,
+            struct hipChannelFormatDesc desc) {
+        normalized = norm;
+        readMode = mode;
+        filterMode = fMode;
+        // One address mode is shared by all three dimensions.
+        addressMode[0] = addressMode[1] = addressMode[2] = aMode;
+        channelDesc = desc;
+        sRGB = 0;
+        textureObject = nullptr;
+        maxAnisotropy = 0;
+        mipmapLevelBias = minMipmapLevelClamp = maxMipmapLevelClamp = 0;
+    }
+};
+
+#endif /* __cplusplus */
+
+#else
+#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
+#endif
+
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b694de5b421ecc7f647562b68258a0743dfab943
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/compiler.cpython-311.pyc b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/compiler.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..217f8ae237a5e5cd6e954b395d7c3461a6e34529
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/compiler.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/driver.cpython-311.pyc b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/driver.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c823afef7031ed45d44b498c813633387ed5cad5
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/driver.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openacc/cupti_openacc.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openacc/cupti_openacc.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7ea50da7beb2187e77f7606dd70faed0e4b4add
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openacc/cupti_openacc.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#include <cuda_stdint.h>
+
+#if !defined(_CUPTI_OPENACC_H_)
+#define _CUPTI_OPENACC_H_
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__LP64__)
+#define CUPTILP64 1
+#elif defined(_WIN64)
+#define CUPTILP64 1
+#else
+#undef CUPTILP64
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \brief Initialize OpenACC support
+ *
+ * \param profRegister function of type acc_prof_reg as obtained from acc_register_library
+ * \param profUnregister function of type acc_prof_reg as obtained from acc_register_library
+ * \param profLookup function of type acc_prof_lookup as obtained from acc_register_library
+ */
+CUptiResult CUPTIAPI
+cuptiOpenACCInitialize(void *profRegister, void *profUnregister, void *profLookup);
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_OPENACC_H_*/
+
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/cupti_openmp.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/cupti_openmp.h
new file mode 100644
index 0000000000000000000000000000000000000000..303dd42878fb02774d872c197ccc27b17f2af69e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/cupti_openmp.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#include <cuda_stdint.h>
+#include "Openmp/omp-tools.h"
+
+#if !defined(_CUPTI_OPENMP_H_)
+#define _CUPTI_OPENMP_H_
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__LP64__)
+#define CUPTILP64 1
+#elif defined(_WIN64)
+#define CUPTILP64 1
+#else
+#undef CUPTILP64
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \brief Initialize OPENMP support (deprecated, used before OpenMP 5.0)
+ *
+ */
+int CUPTIAPI cuptiOpenMpInitialize(ompt_function_lookup_t ompt_fn_lookup, const char *runtime_version, unsigned int ompt_version);
+
+/**
+ * \brief Initialize OPENMP support
+ *
+ */
+int CUPTIAPI cuptiOpenMpInitialize_v2(ompt_function_lookup_t lookup, int initial_device_num, ompt_data_t *tool_data);
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_OPENMP_H_*/
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/omp-tools.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/omp-tools.h
new file mode 100644
index 0000000000000000000000000000000000000000..276967d07e8f8c0f7686e5b3b15151edf2415ae7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/omp-tools.h
@@ -0,0 +1,1083 @@
+/*
+ * include/50/omp-tools.h.var
+ */
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __OMPT__
+#define __OMPT__
+
+/*****************************************************************************
+ * system include files
+ *****************************************************************************/
+
+#include <stdint.h>
+#include <stddef.h>
+
+/*****************************************************************************
+ * iteration macros
+ *****************************************************************************/
+
+#define FOREACH_OMPT_INQUIRY_FN(macro) /* X-macro list: expands to macro(name) once for each inquiry function name below */ \
+    macro (ompt_enumerate_states)           \
+    macro (ompt_enumerate_mutex_impls)      \
+    /* every name listed here has a matching <name>_t function-pointer typedef later in this header */ \
+    macro (ompt_set_callback)               \
+    macro (ompt_get_callback)               \
+                                            \
+    macro (ompt_get_state)                  \
+                                            \
+    macro (ompt_get_parallel_info)          \
+    macro (ompt_get_task_info)              \
+    macro (ompt_get_task_memory)            \
+    macro (ompt_get_thread_data)            \
+    macro (ompt_get_unique_id)              \
+    macro (ompt_finalize_tool)              \
+                                            \
+    macro(ompt_get_num_procs)               \
+    macro(ompt_get_num_places)              \
+    macro(ompt_get_place_proc_ids)          \
+    macro(ompt_get_place_num)               \
+    macro(ompt_get_partition_place_nums)    \
+    macro(ompt_get_proc_id)                 \
+                                            \
+    macro(ompt_get_target_info)             \
+    macro(ompt_get_num_devices)
+
+#define FOREACH_OMPT_STATE(macro)                                                                \
+    /* X-macro list: expands to macro(state_name, code) per thread state; same pairs as enum ompt_state_t */ \
+    /* listed first, although 0x102 is numerically the highest code defined here */             \
+    macro (ompt_state_undefined, 0x102)      /* undefined thread state */                        \
+                                                                                                \
+    /* work states (0..15) */                                                                   \
+    macro (ompt_state_work_serial, 0x000)    /* working outside parallel */                      \
+    macro (ompt_state_work_parallel, 0x001)  /* working within parallel */                       \
+    macro (ompt_state_work_reduction, 0x002) /* performing a reduction */                        \
+                                                                                                \
+    /* barrier wait states (16..31) */                                                          \
+    macro (ompt_state_wait_barrier, 0x010)   /* waiting at a barrier */                          \
+    macro (ompt_state_wait_barrier_implicit_parallel, 0x011)                                     \
+                                            /* implicit barrier at the end of parallel region */\
+    macro (ompt_state_wait_barrier_implicit_workshare, 0x012)                                    \
+                                            /* implicit barrier at the end of worksharing */    \
+    macro (ompt_state_wait_barrier_implicit, 0x013)  /* implicit barrier */                      \
+    macro (ompt_state_wait_barrier_explicit, 0x014)  /* explicit barrier */                      \
+                                                                                                \
+    /* task wait states (32..63) */                                                             \
+    macro (ompt_state_wait_taskwait, 0x020)  /* waiting at a taskwait */                         \
+    macro (ompt_state_wait_taskgroup, 0x021) /* waiting at a taskgroup */                        \
+                                                                                                \
+    /* mutex wait states (64..127) */                                                           \
+    macro (ompt_state_wait_mutex, 0x040)                                                         \
+    macro (ompt_state_wait_lock, 0x041)      /* waiting for lock */                              \
+    macro (ompt_state_wait_critical, 0x042)  /* waiting for critical */                          \
+    macro (ompt_state_wait_atomic, 0x043)    /* waiting for atomic */                            \
+    macro (ompt_state_wait_ordered, 0x044)   /* waiting for ordered */                           \
+                                                                                                \
+    /* target wait states (128..255) */                                                         \
+    macro (ompt_state_wait_target, 0x080)        /* waiting for target region */                 \
+    macro (ompt_state_wait_target_map, 0x081)    /* waiting for target data mapping operation */ \
+    macro (ompt_state_wait_target_update, 0x082) /* waiting for target update operation */       \
+                                                                                                \
+    /* misc (256..511) */                                                                       \
+    macro (ompt_state_idle, 0x100)           /* waiting for work */                              \
+    macro (ompt_state_overhead, 0x101)       /* overhead excluding wait states */                \
+                                                                                                \
+    /* implementation-specific states (512..); none are listed in this header */
+
+
+#define FOREACH_KMP_MUTEX_IMPL(macro) /* X-macro list: macro(name, code) per mutex implementation; expanded into enum kmp_mutex_impl_t */ \
+    macro (kmp_mutex_impl_none, 0)         /* unknown implementation */              \
+    macro (kmp_mutex_impl_spin, 1)         /* based on spin */                       \
+    macro (kmp_mutex_impl_queuing, 2)      /* based on some fair policy */           \
+    macro (kmp_mutex_impl_speculative, 3)  /* based on HW-supported speculation */
+
+#define FOREACH_OMPT_EVENT(macro)                                                                                        \
+    /* X-macro list: macro(event_name, callback_signature_type, event_id); ids equal those in enum ompt_callbacks_t */   \
+    /*--- Mandatory Events ---*/                                                                                         \
+    macro (ompt_callback_thread_begin,      ompt_callback_thread_begin_t,       1) /* thread begin                    */ \
+    macro (ompt_callback_thread_end,        ompt_callback_thread_end_t,         2) /* thread end                      */ \
+                                                                                                                         \
+    macro (ompt_callback_parallel_begin,    ompt_callback_parallel_begin_t,     3) /* parallel begin                  */ \
+    macro (ompt_callback_parallel_end,      ompt_callback_parallel_end_t,       4) /* parallel end                    */ \
+                                                                                                                         \
+    macro (ompt_callback_task_create,       ompt_callback_task_create_t,        5) /* task create                     */ \
+    macro (ompt_callback_task_schedule,     ompt_callback_task_schedule_t,      6) /* task schedule                   */ \
+    macro (ompt_callback_implicit_task,     ompt_callback_implicit_task_t,      7) /* implicit task                   */ \
+                                                                                                                         \
+    macro (ompt_callback_target,            ompt_callback_target_t,             8) /* target                          */ \
+    macro (ompt_callback_target_data_op,    ompt_callback_target_data_op_t,     9) /* target data op                  */ \
+    macro (ompt_callback_target_submit,     ompt_callback_target_submit_t,     10) /* target submit                   */ \
+                                                                                                                         \
+    macro (ompt_callback_control_tool,      ompt_callback_control_tool_t,      11) /* control tool                    */ \
+                                                                                                                         \
+    macro (ompt_callback_device_initialize, ompt_callback_device_initialize_t, 12) /* device initialize               */ \
+    macro (ompt_callback_device_finalize,   ompt_callback_device_finalize_t,   13) /* device finalize                 */ \
+                                                                                                                         \
+    macro (ompt_callback_device_load,       ompt_callback_device_load_t,       14) /* device load                     */ \
+    macro (ompt_callback_device_unload,     ompt_callback_device_unload_t,     15) /* device unload                   */ \
+                                                                                                                         \
+    /* Optional Events */                                                                                                \
+    macro (ompt_callback_sync_region_wait,  ompt_callback_sync_region_t,       16) /* sync region wait begin or end   */ \
+                                                                                                                         \
+    macro (ompt_callback_mutex_released,    ompt_callback_mutex_t,             17) /* mutex released                  */ \
+                                                                                                                         \
+    macro (ompt_callback_dependences,       ompt_callback_dependences_t,       18) /* report task dependences         */ \
+    macro (ompt_callback_task_dependence,   ompt_callback_task_dependence_t,   19) /* report task dependence          */ \
+                                                                                                                         \
+    macro (ompt_callback_work,              ompt_callback_work_t,              20) /* task at work begin or end       */ \
+                                                                                                                         \
+    macro (ompt_callback_master,            ompt_callback_master_t,            21) /* task at master begin or end     */ \
+                                                                                                                         \
+    macro (ompt_callback_target_map,        ompt_callback_target_map_t,        22) /* target map                      */ \
+                                                                                                                         \
+    macro (ompt_callback_sync_region,       ompt_callback_sync_region_t,       23) /* sync region begin or end        */ \
+                                                                                                                         \
+    macro (ompt_callback_lock_init,         ompt_callback_mutex_acquire_t,     24) /* lock init                       */ \
+    macro (ompt_callback_lock_destroy,      ompt_callback_mutex_t,             25) /* lock destroy                    */ \
+                                                                                                                         \
+    macro (ompt_callback_mutex_acquire,     ompt_callback_mutex_acquire_t,     26) /* mutex acquire                   */ \
+    macro (ompt_callback_mutex_acquired,    ompt_callback_mutex_t,             27) /* mutex acquired                  */ \
+                                                                                                                         \
+    macro (ompt_callback_nest_lock,         ompt_callback_nest_lock_t,         28) /* nest lock                       */ \
+                                                                                                                         \
+    macro (ompt_callback_flush,             ompt_callback_flush_t,             29) /* after executing flush           */ \
+                                                                                                                         \
+    macro (ompt_callback_cancel,            ompt_callback_cancel_t,            30) /* cancel innermost binding region */ \
+                                                                                                                         \
+    macro (ompt_callback_reduction,         ompt_callback_sync_region_t,       31) /* reduction                       */ \
+                                                                                                                         \
+    macro (ompt_callback_dispatch,          ompt_callback_dispatch_t,          32) /* dispatch of work                */
+
+/*****************************************************************************
+ * implementation specific types
+ *****************************************************************************/
+
+typedef enum kmp_mutex_impl_t { /* mutex implementation kinds; enumerators are generated from FOREACH_KMP_MUTEX_IMPL */
+#define kmp_mutex_impl_macro(impl, code) impl = code, /* emits one "name = code," enumerator (the last one keeps its trailing comma) */
+    FOREACH_KMP_MUTEX_IMPL(kmp_mutex_impl_macro)
+#undef kmp_mutex_impl_macro /* the helper macro is only needed for this enum */
+} kmp_mutex_impl_t;
+
+/*****************************************************************************
+ * definitions generated from spec
+ *****************************************************************************/
+
+typedef enum ompt_callbacks_t { /* event ids taken by ompt_set_callback_t / ompt_get_callback_t; same ids as FOREACH_OMPT_EVENT */
+  ompt_callback_thread_begin             = 1, /* 1..15 are the events FOREACH_OMPT_EVENT lists as mandatory */
+  ompt_callback_thread_end               = 2,
+  ompt_callback_parallel_begin           = 3,
+  ompt_callback_parallel_end             = 4,
+  ompt_callback_task_create              = 5,
+  ompt_callback_task_schedule            = 6,
+  ompt_callback_implicit_task            = 7,
+  ompt_callback_target                   = 8,
+  ompt_callback_target_data_op           = 9,
+  ompt_callback_target_submit            = 10,
+  ompt_callback_control_tool             = 11,
+  ompt_callback_device_initialize        = 12,
+  ompt_callback_device_finalize          = 13,
+  ompt_callback_device_load              = 14,
+  ompt_callback_device_unload            = 15,
+  ompt_callback_sync_region_wait         = 16, /* 16..32 are the events FOREACH_OMPT_EVENT lists as optional */
+  ompt_callback_mutex_released           = 17,
+  ompt_callback_dependences              = 18,
+  ompt_callback_task_dependence          = 19,
+  ompt_callback_work                     = 20,
+  ompt_callback_master                   = 21,
+  ompt_callback_target_map               = 22,
+  ompt_callback_sync_region              = 23,
+  ompt_callback_lock_init                = 24,
+  ompt_callback_lock_destroy             = 25,
+  ompt_callback_mutex_acquire            = 26,
+  ompt_callback_mutex_acquired           = 27,
+  ompt_callback_nest_lock                = 28,
+  ompt_callback_flush                    = 29,
+  ompt_callback_cancel                   = 30,
+  ompt_callback_reduction                = 31,
+  ompt_callback_dispatch                 = 32
+} ompt_callbacks_t;
+
+typedef enum ompt_record_t { /* record format tag; the return type of ompt_get_record_type_t */
+  ompt_record_ompt               = 1,
+  ompt_record_native             = 2,
+  ompt_record_invalid            = 3
+} ompt_record_t;
+
+typedef enum ompt_record_native_t { /* class of a native record; the type of ompt_record_abstract_t.rclass */
+  ompt_record_native_info  = 1,
+  ompt_record_native_event = 2
+} ompt_record_native_t;
+
+typedef enum ompt_set_result_t { /* result of ompt_set_callback_t, ompt_set_trace_ompt_t and ompt_set_trace_native_t */
+  ompt_set_error            = 0, /* the only enumerator with value 0 */
+  ompt_set_never            = 1,
+  ompt_set_impossible       = 2,
+  ompt_set_sometimes        = 3,
+  ompt_set_sometimes_paired = 4,
+  ompt_set_always           = 5
+} ompt_set_result_t;
+
+typedef uint64_t ompt_id_t; /* 64-bit identifier; used for the *_id fields of the ompt_record_* structs */
+
+typedef uint64_t ompt_device_time_t; /* device timestamp; unit is not defined here (ompt_translate_time_t maps one to a double) */
+
+typedef uint64_t ompt_buffer_cursor_t; /* position inside a trace buffer; see ompt_advance_buffer_cursor_t */
+
+typedef enum ompt_thread_t { /* thread kind; the thread_type argument of ompt_callback_thread_begin_t */
+  ompt_thread_initial                 = 1,
+  ompt_thread_worker                  = 2,
+  ompt_thread_other                   = 3,
+  ompt_thread_unknown                 = 4
+} ompt_thread_t;
+
+typedef enum ompt_scope_endpoint_t { /* tells a begin/end callback which end of the scope is being reported */
+  ompt_scope_begin                    = 1,
+  ompt_scope_end                      = 2
+} ompt_scope_endpoint_t;
+
+typedef enum ompt_dispatch_t { /* the kind argument of ompt_callback_dispatch_t */
+  ompt_dispatch_iteration             = 1,
+  ompt_dispatch_section               = 2
+} ompt_dispatch_t;
+
+typedef enum ompt_sync_region_t { /* the kind argument of ompt_callback_sync_region_t */
+  ompt_sync_region_barrier                = 1,
+  ompt_sync_region_barrier_implicit       = 2,
+  ompt_sync_region_barrier_explicit       = 3,
+  ompt_sync_region_barrier_implementation = 4,
+  ompt_sync_region_taskwait               = 5,
+  ompt_sync_region_taskgroup              = 6,
+  ompt_sync_region_reduction              = 7
+} ompt_sync_region_t;
+
+typedef enum ompt_target_data_op_t { /* target data operation kinds; sequential codes, not bit flags */
+  ompt_target_data_alloc                = 1,
+  ompt_target_data_transfer_to_device   = 2,
+  ompt_target_data_transfer_from_device = 3,
+  ompt_target_data_delete               = 4,
+  ompt_target_data_associate            = 5,
+  ompt_target_data_disassociate         = 6
+} ompt_target_data_op_t;
+
+typedef enum ompt_work_t { /* the wstype argument of ompt_callback_work_t */
+  ompt_work_loop               = 1,
+  ompt_work_sections           = 2,
+  ompt_work_single_executor    = 3,
+  ompt_work_single_other       = 4,
+  ompt_work_workshare          = 5,
+  ompt_work_distribute         = 6,
+  ompt_work_taskloop           = 7
+} ompt_work_t;
+
+typedef enum ompt_mutex_t { /* the kind argument of ompt_callback_mutex_acquire_t and ompt_callback_mutex_t */
+  ompt_mutex_lock                     = 1,
+  ompt_mutex_test_lock                = 2,
+  ompt_mutex_nest_lock                = 3,
+  ompt_mutex_test_nest_lock           = 4,
+  ompt_mutex_critical                 = 5,
+  ompt_mutex_atomic                   = 6,
+  ompt_mutex_ordered                  = 7
+} ompt_mutex_t;
+
+typedef enum ompt_native_mon_flag_t { /* bit flags, one distinct bit each (0x01..0x80); presumably OR-ed into ompt_set_trace_native_t's flags -- confirm */
+  ompt_native_data_motion_explicit    = 0x01,
+  ompt_native_data_motion_implicit    = 0x02,
+  ompt_native_kernel_invocation       = 0x04,
+  ompt_native_kernel_execution        = 0x08,
+  ompt_native_driver                  = 0x10,
+  ompt_native_runtime                 = 0x20,
+  ompt_native_overhead                = 0x40,
+  ompt_native_idleness                = 0x80
+} ompt_native_mon_flag_t;
+
+typedef enum ompt_task_flag_t { /* bit flags, one distinct bit each: four low bits and five high bits */
+  ompt_task_initial                   = 0x00000001,
+  ompt_task_implicit                  = 0x00000002,
+  ompt_task_explicit                  = 0x00000004,
+  ompt_task_target                    = 0x00000008,
+  ompt_task_undeferred                = 0x08000000,
+  ompt_task_untied                    = 0x10000000,
+  ompt_task_final                     = 0x20000000,
+  ompt_task_mergeable                 = 0x40000000,
+  ompt_task_merged                    = 0x80000000 /* NOTE(review): exceeds INT_MAX for 32-bit int, while the callbacks in this header carry flags as int */
+} ompt_task_flag_t;
+
+typedef enum ompt_task_status_t { /* the prior_task_status argument of ompt_callback_task_schedule_t */
+  ompt_task_complete      = 1,
+  ompt_task_yield         = 2,
+  ompt_task_cancel        = 3,
+  ompt_task_detach        = 4,
+  ompt_task_early_fulfill = 5,
+  ompt_task_late_fulfill  = 6,
+  ompt_task_switch        = 7
+} ompt_task_status_t;
+
+typedef enum ompt_target_t { /* target construct kinds; sequential codes, not bit flags */
+  ompt_target                         = 1,
+  ompt_target_enter_data              = 2,
+  ompt_target_exit_data               = 3,
+  ompt_target_update                  = 4
+} ompt_target_t;
+
+typedef enum ompt_parallel_flag_t { /* bit flags, one distinct bit each: two low bits and the two highest bits */
+  ompt_parallel_invoker_program = 0x00000001,
+  ompt_parallel_invoker_runtime = 0x00000002,
+  ompt_parallel_league          = 0x40000000,
+  ompt_parallel_team            = 0x80000000 /* NOTE(review): exceeds INT_MAX for 32-bit int; the parallel begin/end callbacks take flags as int */
+} ompt_parallel_flag_t;
+
+typedef enum ompt_target_map_flag_t { /* bit flags, one distinct bit each (0x01..0x20) */
+  ompt_target_map_flag_to             = 0x01,
+  ompt_target_map_flag_from           = 0x02,
+  ompt_target_map_flag_alloc          = 0x04,
+  ompt_target_map_flag_release        = 0x08,
+  ompt_target_map_flag_delete         = 0x10,
+  ompt_target_map_flag_implicit       = 0x20
+} ompt_target_map_flag_t;
+
+typedef enum ompt_dependence_type_t { /* the type of ompt_dependence_t.dependence_type; sequential codes */
+  ompt_dependence_type_in              = 1,
+  ompt_dependence_type_out             = 2,
+  ompt_dependence_type_inout           = 3,
+  ompt_dependence_type_mutexinoutset   = 4,
+  ompt_dependence_type_source          = 5,
+  ompt_dependence_type_sink            = 6
+} ompt_dependence_type_t;
+
+typedef enum ompt_cancel_flag_t { /* bit flags, one distinct bit each (0x01..0x40); presumably the flags of ompt_callback_cancel_t -- confirm */
+  ompt_cancel_parallel       = 0x01,
+  ompt_cancel_sections       = 0x02,
+  ompt_cancel_loop           = 0x04,
+  ompt_cancel_taskgroup      = 0x08,
+  ompt_cancel_activated      = 0x10,
+  ompt_cancel_detected       = 0x20,
+  ompt_cancel_discarded_task = 0x40
+} ompt_cancel_flag_t;
+
+typedef uint64_t ompt_hwid_t; /* the type of ompt_record_abstract_t.hwid */
+
+typedef uint64_t ompt_wait_id_t; /* 64-bit wait identifier; used by the mutex/nest-lock callbacks and ompt_get_state_t */
+
+typedef enum ompt_frame_flag_t { /* not all single bits: 0x00/0x01 in the low nibble, 0x10/0x20/0x30 in the next one */
+  ompt_frame_runtime        = 0x00,
+  ompt_frame_application    = 0x01,
+  ompt_frame_cfa            = 0x10,
+  ompt_frame_framepointer   = 0x20,
+  ompt_frame_stackaddress   = 0x30 /* equals 0x10 | 0x20; presumably a 3-way selector rather than a flag combination -- confirm */
+} ompt_frame_flag_t;
+
+typedef enum ompt_state_t { /* thread states; spells out the same name/value pairs as FOREACH_OMPT_STATE */
+  ompt_state_work_serial                      = 0x000, /* work states (0..15) */
+  ompt_state_work_parallel                    = 0x001,
+  ompt_state_work_reduction                   = 0x002,
+
+  ompt_state_wait_barrier                     = 0x010, /* barrier wait states (16..31) */
+  ompt_state_wait_barrier_implicit_parallel   = 0x011,
+  ompt_state_wait_barrier_implicit_workshare  = 0x012,
+  ompt_state_wait_barrier_implicit            = 0x013,
+  ompt_state_wait_barrier_explicit            = 0x014,
+
+  ompt_state_wait_taskwait                    = 0x020, /* task wait states (32..63) */
+  ompt_state_wait_taskgroup                   = 0x021,
+
+  ompt_state_wait_mutex                       = 0x040, /* mutex wait states (64..127) */
+  ompt_state_wait_lock                        = 0x041,
+  ompt_state_wait_critical                    = 0x042,
+  ompt_state_wait_atomic                      = 0x043,
+  ompt_state_wait_ordered                     = 0x044,
+
+  ompt_state_wait_target                      = 0x080, /* target wait states (128..255) */
+  ompt_state_wait_target_map                  = 0x081,
+  ompt_state_wait_target_update               = 0x082,
+
+  ompt_state_idle                             = 0x100, /* misc (256..511) */
+  ompt_state_overhead                         = 0x101,
+  ompt_state_undefined                        = 0x102
+} ompt_state_t;
+
+typedef uint64_t (*ompt_get_unique_id_t) (void); /* inquiry function (listed in FOREACH_OMPT_INQUIRY_FN): no arguments, returns a 64-bit value */
+
+typedef uint64_t ompd_size_t; /* ompd_-prefixed scalar types: all are 64 bits wide */
+
+typedef uint64_t ompd_wait_id_t;
+
+typedef uint64_t ompd_addr_t;
+typedef int64_t  ompd_word_t; /* the only signed type in this group */
+typedef uint64_t ompd_seg_t;
+
+typedef uint64_t ompd_device_t;
+
+typedef uint64_t ompd_thread_id_t;
+
+typedef enum ompd_scope_t { /* ompd_-prefixed scope kinds; sequential codes starting at 1 */
+  ompd_scope_global = 1,
+  ompd_scope_address_space = 2,
+  ompd_scope_thread = 3,
+  ompd_scope_parallel = 4,
+  ompd_scope_implicit_task = 5,
+  ompd_scope_task = 6
+} ompd_scope_t;
+
+typedef uint64_t ompd_icv_id_t; /* 64-bit id; NOTE(review): presumably identifies an internal control variable -- confirm */
+
+typedef enum ompd_rc_t { /* ompd_-prefixed return codes; 0 is ompd_rc_ok, 1..10 are the non-ok codes */
+  ompd_rc_ok = 0,
+  ompd_rc_unavailable = 1,
+  ompd_rc_stale_handle = 2,
+  ompd_rc_bad_input = 3,
+  ompd_rc_error = 4,
+  ompd_rc_unsupported = 5,
+  ompd_rc_needs_state_tracking = 6,
+  ompd_rc_incompatible = 7,
+  ompd_rc_device_read_error = 8,
+  ompd_rc_device_write_error = 9,
+  ompd_rc_nomem = 10, /* trailing comma after the last enumerator needs C99 / C++11 or later */
+} ompd_rc_t;
+
+typedef void (*ompt_interface_fn_t) (void); /* generic function pointer; the return type of ompt_function_lookup_t */
+
+typedef ompt_interface_fn_t (*ompt_function_lookup_t) ( /* looks an interface function up by name */
+  const char *interface_function_name /* presumably one of the names in FOREACH_OMPT_INQUIRY_FN; result is then cast to <name>_t -- confirm */
+);
+
+typedef union ompt_data_t { /* one storage slot viewed either as a 64-bit integer or as a pointer */
+  uint64_t value;
+  void *ptr;
+} ompt_data_t;
+
+typedef struct ompt_frame_t { /* frame info handed (as const ompt_frame_t *) to the parallel_begin and task_create callbacks */
+  ompt_data_t exit_frame;
+  ompt_data_t enter_frame;
+  int exit_frame_flags; /* presumably ompt_frame_flag_t values -- confirm */
+  int enter_frame_flags; /* presumably ompt_frame_flag_t values -- confirm */
+} ompt_frame_t;
+
+typedef void (*ompt_callback_t) (void); /* generic callback pointer accepted by ompt_set_callback_t and returned via ompt_get_callback_t */
+
+typedef void ompt_device_t; /* opaque: this header only ever uses ompt_device_t through pointers */
+
+typedef void ompt_buffer_t; /* opaque: this header only ever uses ompt_buffer_t through pointers */
+
+typedef void (*ompt_callback_buffer_request_t) ( /* "request" callback type passed to ompt_start_trace_t */
+  int device_num,
+  ompt_buffer_t **buffer, /* pointer-to-pointer: presumably the callee stores the buffer it provides here -- confirm */
+  size_t *bytes /* pointer: presumably the callee stores the buffer size here -- confirm */
+);
+
+typedef void (*ompt_callback_buffer_complete_t) ( /* "complete" callback type passed to ompt_start_trace_t */
+  int device_num,
+  ompt_buffer_t *buffer,
+  size_t bytes,
+  ompt_buffer_cursor_t begin, /* a cursor into buffer; see ompt_advance_buffer_cursor_t */
+  int buffer_owned
+);
+
+typedef void (*ompt_finalize_t) ( /* type of ompt_start_tool_result_t.finalize */
+  ompt_data_t *tool_data
+);
+
+typedef int (*ompt_initialize_t) ( /* type of ompt_start_tool_result_t.initialize */
+  ompt_function_lookup_t lookup, /* lets the initializer resolve interface functions by name */
+  int initial_device_num,
+  ompt_data_t *tool_data
+);
+
+typedef struct ompt_start_tool_result_t { /* bundles a tool's initialize/finalize entry points with its tool_data slot */
+  ompt_initialize_t initialize;
+  ompt_finalize_t finalize;
+  ompt_data_t tool_data; /* held by value here; both entry points take an ompt_data_t *tool_data */
+} ompt_start_tool_result_t;
+
+typedef struct ompt_record_abstract_t { /* the record view returned (by pointer) from ompt_get_record_abstract_t */
+  ompt_record_native_t rclass; /* info vs. event record */
+  const char *type;
+  ompt_device_time_t start_time;
+  ompt_device_time_t end_time;
+  ompt_hwid_t hwid;
+} ompt_record_abstract_t;
+
+typedef struct ompt_dependence_t { /* one dependence: a variable handle plus its dependence type */
+  ompt_data_t variable;
+  ompt_dependence_type_t dependence_type;
+} ompt_dependence_t;
+
+typedef int (*ompt_enumerate_states_t) ( /* inquiry function type: given one state, reports another state and its name */
+  int current_state,
+  int *next_state, /* output pointer */
+  const char **next_state_name /* output pointer */
+);
+
+typedef int (*ompt_enumerate_mutex_impls_t) ( /* same shape as ompt_enumerate_states_t, for mutex implementations */
+  int current_impl,
+  int *next_impl, /* output pointer */
+  const char **next_impl_name /* output pointer */
+);
+
+typedef ompt_set_result_t (*ompt_set_callback_t) ( /* registers a callback for an event; outcome reported as ompt_set_result_t */
+  ompt_callbacks_t event,
+  ompt_callback_t callback /* generic pointer type; the per-event signature types are paired with events in FOREACH_OMPT_EVENT */
+);
+
+typedef int (*ompt_get_callback_t) ( /* counterpart of ompt_set_callback_t: reports the callback for an event through *callback */
+  ompt_callbacks_t event,
+  ompt_callback_t *callback
+);
+
+typedef ompt_data_t *(*ompt_get_thread_data_t) (void); /* no arguments; returns a pointer to an ompt_data_t */
+
+typedef int (*ompt_get_num_procs_t) (void); /* no arguments; int result */
+
+typedef int (*ompt_get_num_places_t) (void); /* no arguments; int result */
+
+typedef int (*ompt_get_place_proc_ids_t) (
+  int place_num,
+  int ids_size, /* presumably the capacity of ids -- confirm */
+  int *ids /* caller-supplied int array */
+);
+
+typedef int (*ompt_get_place_num_t) (void); /* no arguments; int result */
+
+typedef int (*ompt_get_partition_place_nums_t) (
+  int place_nums_size, /* presumably the capacity of place_nums -- confirm */
+  int *place_nums /* caller-supplied int array */
+);
+
+typedef int (*ompt_get_proc_id_t) (void); /* no arguments; int result */
+
+typedef int (*ompt_get_state_t) ( /* int result; presumably an ompt_state_t value -- confirm */
+  ompt_wait_id_t *wait_id /* output pointer */
+);
+
+typedef int (*ompt_get_parallel_info_t) (
+  int ancestor_level,
+  ompt_data_t **parallel_data, /* output: receives a pointer to the ompt_data_t slot, not a copy */
+  int *team_size /* output pointer */
+);
+
+typedef int (*ompt_get_task_info_t) ( /* every parameter after ancestor_level is an output pointer */
+  int ancestor_level,
+  int *flags,
+  ompt_data_t **task_data,
+  ompt_frame_t **task_frame,
+  ompt_data_t **parallel_data,
+  int *thread_num
+);
+
+typedef int (*ompt_get_task_memory_t)(
+  void **addr, /* output pointer */
+  size_t *size, /* output pointer */
+  int block
+);
+
+typedef int (*ompt_get_target_info_t) ( /* all three parameters are output pointers */
+  uint64_t *device_num,
+  ompt_id_t *target_id,
+  ompt_id_t *host_op_id
+);
+
+typedef int (*ompt_get_num_devices_t) (void); /* no arguments; int result */
+
+typedef void (*ompt_finalize_tool_t) (void); /* no arguments, no result */
+
+typedef int (*ompt_get_device_num_procs_t) ( /* per-device function type: takes the opaque device handle */
+  ompt_device_t *device
+);
+
+typedef ompt_device_time_t (*ompt_get_device_time_t) ( /* returns a device timestamp for the given device */
+  ompt_device_t *device
+);
+
+typedef double (*ompt_translate_time_t) ( /* converts a device timestamp to a double; unit of the result is not stated here */
+  ompt_device_t *device,
+  ompt_device_time_t time
+);
+
+typedef ompt_set_result_t (*ompt_set_trace_ompt_t) ( /* per-device trace setting; outcome reported as ompt_set_result_t */
+  ompt_device_t *device,
+  unsigned int enable,
+  unsigned int etype /* presumably an ompt_callbacks_t event id -- confirm */
+);
+
+typedef ompt_set_result_t (*ompt_set_trace_native_t) ( /* like ompt_set_trace_ompt_t, but takes signed ints and a flags word */
+  ompt_device_t *device,
+  int enable,
+  int flags /* presumably ompt_native_mon_flag_t bits -- confirm */
+);
+
+typedef int (*ompt_start_trace_t) ( /* takes the two buffer callbacks along with the device handle */
+  ompt_device_t *device,
+  ompt_callback_buffer_request_t request,
+  ompt_callback_buffer_complete_t complete
+);
+
+typedef int (*ompt_pause_trace_t) (
+  ompt_device_t *device,
+  int begin_pause /* presumably nonzero starts a pause and zero ends it -- confirm */
+);
+
+typedef int (*ompt_flush_trace_t) ( /* takes only the device handle; int result */
+  ompt_device_t *device
+);
+
+typedef int (*ompt_stop_trace_t) ( /* takes only the device handle; int result */
+  ompt_device_t *device
+);
+
+typedef int (*ompt_advance_buffer_cursor_t) ( /* steps a cursor through a trace buffer: current in, *next out */
+  ompt_device_t *device,
+  ompt_buffer_t *buffer,
+  size_t size,
+  ompt_buffer_cursor_t current,
+  ompt_buffer_cursor_t *next
+);
+
+typedef ompt_record_t (*ompt_get_record_type_t) ( /* reports the record format (ompt_record_t) at a cursor position */
+  ompt_buffer_t *buffer,
+  ompt_buffer_cursor_t current
+);
+
+typedef void *(*ompt_get_record_native_t) ( /* returns an untyped pointer for the record at a cursor position */
+  ompt_buffer_t *buffer,
+  ompt_buffer_cursor_t current,
+  ompt_id_t *host_op_id /* output pointer */
+);
+
+typedef ompt_record_abstract_t * /* return type: pointer to the abstract view of a native record */
+(*ompt_get_record_abstract_t) (
+  void *native_record /* untyped, like the pointer ompt_get_record_native_t returns */
+);
+
+typedef void (*ompt_callback_thread_begin_t) ( /* signature for the ompt_callback_thread_begin event */
+  ompt_thread_t thread_type,
+  ompt_data_t *thread_data
+);
+
+typedef struct ompt_record_thread_begin_t { /* record form of thread_begin: keeps thread_type only */
+  ompt_thread_t thread_type;
+} ompt_record_thread_begin_t;
+
+typedef void (*ompt_callback_thread_end_t) ( /* signature for the ompt_callback_thread_end event */
+  ompt_data_t *thread_data
+);
+
+typedef void (*ompt_callback_parallel_begin_t) ( /* signature for the ompt_callback_parallel_begin event */
+  ompt_data_t *encountering_task_data,
+  const ompt_frame_t *encountering_task_frame,
+  ompt_data_t *parallel_data,
+  unsigned int requested_parallelism,
+  int flags, /* presumably ompt_parallel_flag_t bits -- confirm */
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_parallel_begin_t { /* record form: ompt_data_t* handles become ompt_id_t ids; the frame argument is not recorded */
+  ompt_id_t encountering_task_id;
+  ompt_id_t parallel_id;
+  unsigned int requested_parallelism;
+  int flags;
+  const void *codeptr_ra;
+} ompt_record_parallel_begin_t;
+
+typedef void (*ompt_callback_parallel_end_t) ( /* signature for the ompt_callback_parallel_end event */
+  ompt_data_t *parallel_data, /* note: parallel_data comes first here, unlike in parallel_begin */
+  ompt_data_t *encountering_task_data,
+  int flags,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_parallel_end_t { /* record form: same fields in the same order, with handles replaced by ompt_id_t ids */
+  ompt_id_t parallel_id;
+  ompt_id_t encountering_task_id;
+  int flags;
+  const void *codeptr_ra;
+} ompt_record_parallel_end_t;
+
+typedef void (*ompt_callback_work_t) ( /* signature for the ompt_callback_work event */
+  ompt_work_t wstype,
+  ompt_scope_endpoint_t endpoint, /* begin or end of the reported scope */
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  uint64_t count,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_work_t { /* record form: same fields, with the two handles replaced by ompt_id_t ids */
+  ompt_work_t wstype;
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  uint64_t count;
+  const void *codeptr_ra;
+} ompt_record_work_t;
+
+typedef void (*ompt_callback_dispatch_t) ( /* signature for the ompt_callback_dispatch event */
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  ompt_dispatch_t kind, /* iteration or section */
+  ompt_data_t instance /* passed by value, unlike the two handles above */
+);
+
+typedef struct ompt_record_dispatch_t { /* record form: handles become ompt_id_t ids; kind and instance are kept as-is */
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  ompt_dispatch_t kind;
+  ompt_data_t instance; /* still an ompt_data_t, not an id */
+} ompt_record_dispatch_t;
+
+typedef void (*ompt_callback_task_create_t) ( /* signature for the ompt_callback_task_create event */
+  ompt_data_t *encountering_task_data,
+  const ompt_frame_t *encountering_task_frame,
+  ompt_data_t *new_task_data,
+  int flags, /* presumably ompt_task_flag_t bits -- confirm */
+  int has_dependences,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_task_create_t { /* record form: handles become ompt_id_t ids; the frame argument is not recorded */
+  ompt_id_t encountering_task_id;
+  ompt_id_t new_task_id;
+  int flags;
+  int has_dependences;
+  const void *codeptr_ra;
+} ompt_record_task_create_t;
+
+typedef void (*ompt_callback_dependences_t) ( /* signature for the ompt_callback_dependences event */
+  ompt_data_t *task_data,
+  const ompt_dependence_t *deps, /* presumably an array of ndeps entries -- confirm */
+  int ndeps
+);
+
+typedef struct ompt_record_dependences_t { /* record form: note it holds a single ompt_dependence_t by value, not an array */
+  ompt_id_t task_id;
+  ompt_dependence_t dep;
+  int ndeps;
+} ompt_record_dependences_t;
+
+typedef void (*ompt_callback_task_dependence_t) ( /* signature for the ompt_callback_task_dependence event: a source/sink task pair */
+  ompt_data_t *src_task_data,
+  ompt_data_t *sink_task_data
+);
+
+typedef struct ompt_record_task_dependence_t { /* record form: both handles replaced by ompt_id_t ids */
+  ompt_id_t src_task_id;
+  ompt_id_t sink_task_id;
+} ompt_record_task_dependence_t;
+
+typedef void (*ompt_callback_task_schedule_t) ( /* signature for the ompt_callback_task_schedule event */
+  ompt_data_t *prior_task_data,
+  ompt_task_status_t prior_task_status, /* status of the prior task, not of the next one */
+  ompt_data_t *next_task_data
+);
+
+typedef struct ompt_record_task_schedule_t { /* record form: both handles replaced by ompt_id_t ids */
+  ompt_id_t prior_task_id;
+  ompt_task_status_t prior_task_status;
+  ompt_id_t next_task_id;
+} ompt_record_task_schedule_t;
+
+typedef void (*ompt_callback_implicit_task_t) ( /* signature for the ompt_callback_implicit_task event */
+  ompt_scope_endpoint_t endpoint, /* begin or end of the reported scope */
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  unsigned int actual_parallelism,
+  unsigned int index,
+  int flags /* presumably ompt_task_flag_t bits -- confirm */
+);
+
+typedef struct ompt_record_implicit_task_t { /* record form: same fields, with the two handles replaced by ompt_id_t ids */
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  unsigned int actual_parallelism;
+  unsigned int index;
+  int flags;
+} ompt_record_implicit_task_t;
+
+typedef void (*ompt_callback_master_t) ( /* signature for the ompt_callback_master event */
+  ompt_scope_endpoint_t endpoint, /* begin or end of the reported scope */
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_master_t { /* record form: same fields, with the two handles replaced by ompt_id_t ids */
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  const void *codeptr_ra;
+} ompt_record_master_t;
+
+typedef void (*ompt_callback_sync_region_t) ( /* shared signature for the sync_region, sync_region_wait and reduction events (see FOREACH_OMPT_EVENT) */
+  ompt_sync_region_t kind,
+  ompt_scope_endpoint_t endpoint, /* begin or end of the reported scope */
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_sync_region_t { /* record form: same fields, with the two handles replaced by ompt_id_t ids */
+  ompt_sync_region_t kind;
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  const void *codeptr_ra;
+} ompt_record_sync_region_t;
+
+typedef void (*ompt_callback_mutex_acquire_t) ( /* callback function-pointer type; returns void */
+  ompt_mutex_t kind,
+  unsigned int hint,
+  unsigned int impl, /* this header defines ompt_mutex_impl_none (0), presumably for this argument -- confirm */
+  ompt_wait_id_t wait_id, /* this header defines ompt_wait_id_none (0), presumably for this argument -- confirm */
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_mutex_acquire_t { /* payload type of the `mutex_acquire` member of ompt_record_ompt_t's union */
+  ompt_mutex_t kind; /* members repeat ompt_callback_mutex_acquire_t's parameters with the same names and types */
+  unsigned int hint;
+  unsigned int impl;
+  ompt_wait_id_t wait_id;
+  const void *codeptr_ra;
+} ompt_record_mutex_acquire_t;
+
+typedef void (*ompt_callback_mutex_t) ( /* callback function-pointer type; returns void */
+  ompt_mutex_t kind,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra /* read-only code pointer; presumably a return address -- confirm */
+);
+
+typedef struct ompt_record_mutex_t { /* payload type of the `mutex` member of ompt_record_ompt_t's union */
+  ompt_mutex_t kind; /* members repeat ompt_callback_mutex_t's parameters with the same names and types */
+  ompt_wait_id_t wait_id;
+  const void *codeptr_ra;
+} ompt_record_mutex_t;
+
+typedef void (*ompt_callback_nest_lock_t) ( /* callback function-pointer type; returns void */
+  ompt_scope_endpoint_t endpoint, /* takes a scope endpoint where ompt_callback_mutex_t takes a mutex kind */
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_nest_lock_t { /* payload type of the `nest_lock` member of ompt_record_ompt_t's union */
+  ompt_scope_endpoint_t endpoint; /* members repeat ompt_callback_nest_lock_t's parameters with the same names and types */
+  ompt_wait_id_t wait_id;
+  const void *codeptr_ra;
+} ompt_record_nest_lock_t;
+
+typedef void (*ompt_callback_flush_t) ( /* callback function-pointer type; returns void */
+  ompt_data_t *thread_data, /* the only callback in this chunk that takes thread (rather than task/parallel) data */
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_flush_t { /* payload type of the `flush` member of ompt_record_ompt_t's union */
+  const void *codeptr_ra; /* sole member: thread_data has no counterpart here (ompt_record_ompt_t has its own thread_id) */
+} ompt_record_flush_t;
+
+typedef void (*ompt_callback_cancel_t) ( /* callback function-pointer type; returns void */
+  ompt_data_t *task_data,
+  int flags, /* NOTE(review): the flag values are not declared in this chunk */
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_cancel_t { /* payload type of the `cancel` member of ompt_record_ompt_t's union */
+  ompt_id_t task_id; /* id in place of the callback's task_data pointer */
+  int flags;
+  const void *codeptr_ra;
+} ompt_record_cancel_t;
+
+typedef void (*ompt_callback_device_initialize_t) ( /* callback function-pointer type; returns void; no record struct for it in this chunk */
+  int device_num,
+  const char *type, /* read-only; presumably a C string -- confirm */
+  ompt_device_t *device,
+  ompt_function_lookup_t lookup, /* passed by value; ompt_function_lookup_t is declared outside this chunk */
+  const char *documentation /* read-only; presumably a C string -- confirm */
+);
+
+typedef void (*ompt_callback_device_finalize_t) ( /* callback function-pointer type; returns void */
+  int device_num /* sole argument; same name and type as the first parameter of ompt_callback_device_initialize_t */
+);
+
+typedef void (*ompt_callback_device_load_t) ( /* callback function-pointer type; returns void */
+  int device_num,
+  const char *filename, /* read-only; presumably a C string -- confirm */
+  int64_t offset_in_file, /* signed 64-bit value */
+  void *vma_in_file,
+  size_t bytes,
+  void *host_addr,
+  void *device_addr,
+  uint64_t module_id /* same name and type as the module_id taken by ompt_callback_device_unload_t */
+);
+
+typedef void (*ompt_callback_device_unload_t) ( /* callback function-pointer type; returns void */
+  int device_num,
+  uint64_t module_id /* same name and type as the last parameter of ompt_callback_device_load_t */
+);
+
+typedef void (*ompt_callback_target_data_op_t) ( /* callback function-pointer type; returns void */
+  ompt_id_t target_id,
+  ompt_id_t host_op_id,
+  ompt_target_data_op_t optype, /* enumerators of ompt_target_data_op_t are declared outside this chunk */
+  void *src_addr, /* source address/device pair ... */
+  int src_device_num,
+  void *dest_addr, /* ... and destination address/device pair */
+  int dest_device_num,
+  size_t bytes,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_target_data_op_t { /* payload type of the `target_data_op` member of ompt_record_ompt_t's union */
+  ompt_id_t host_op_id; /* no target_id member here, unlike the callback (ompt_record_ompt_t has its own target_id) */
+  ompt_target_data_op_t optype;
+  void *src_addr;
+  int src_device_num;
+  void *dest_addr;
+  int dest_device_num;
+  size_t bytes;
+  ompt_device_time_t end_time; /* has no counterpart among the callback's parameters */
+  const void *codeptr_ra;
+} ompt_record_target_data_op_t;
+
+typedef void (*ompt_callback_target_t) ( /* callback function-pointer type; returns void */
+  ompt_target_t kind, /* enumerators of ompt_target_t are declared outside this chunk */
+  ompt_scope_endpoint_t endpoint,
+  int device_num,
+  ompt_data_t *task_data,
+  ompt_id_t target_id,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_target_t { /* payload type of the `target` member of ompt_record_ompt_t's union */
+  ompt_target_t kind;
+  ompt_scope_endpoint_t endpoint;
+  int device_num;
+  ompt_id_t task_id; /* id in place of the callback's task_data pointer */
+  ompt_id_t target_id;
+  const void *codeptr_ra;
+} ompt_record_target_t;
+
+typedef void (*ompt_callback_target_map_t) ( /* callback function-pointer type; returns void */
+  ompt_id_t target_id,
+  unsigned int nitems, /* presumably the length of each of the four arrays below -- confirm */
+  void **host_addr, /* pointer to pointers; presumably one host address per item */
+  void **device_addr, /* pointer to pointers; presumably one device address per item */
+  size_t *bytes,
+  unsigned int *mapping_flags,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_target_map_t { /* payload type of the `target_map` member of ompt_record_ompt_t's union */
+  ompt_id_t target_id; /* members repeat ompt_callback_target_map_t's parameters with the same names and types */
+  unsigned int nitems;
+  void **host_addr; /* stored as pointers; who owns the pointed-to arrays is not visible here */
+  void **device_addr;
+  size_t *bytes;
+  unsigned int *mapping_flags;
+  const void *codeptr_ra;
+} ompt_record_target_map_t;
+
+typedef void (*ompt_callback_target_submit_t) ( /* callback function-pointer type; returns void */
+  ompt_id_t target_id,
+  ompt_id_t host_op_id,
+  unsigned int requested_num_teams /* also a member of ompt_record_target_kernel_t */
+);
+
+typedef struct ompt_record_target_kernel_t { /* payload type of the `target_kernel` member of ompt_record_ompt_t's union */
+  ompt_id_t host_op_id; /* shares host_op_id and requested_num_teams with ompt_callback_target_submit_t's parameters */
+  unsigned int requested_num_teams;
+  unsigned int granted_num_teams; /* no counterpart in that callback's parameter list */
+  ompt_device_time_t end_time; /* no counterpart in that callback's parameter list */
+} ompt_record_target_kernel_t;
+
+typedef int (*ompt_callback_control_tool_t) ( /* callback function-pointer type; unlike the other callbacks here it returns int */
+  uint64_t command,
+  uint64_t modifier,
+  void *arg, /* untyped, writable pointer; its interpretation is not visible in this chunk */
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_control_tool_t { /* payload type of the `control_tool` member of ompt_record_ompt_t's union */
+  uint64_t command; /* the callback's `arg` pointer has no member here */
+  uint64_t modifier;
+  const void *codeptr_ra;
+} ompt_record_control_tool_t;
+
+typedef struct ompd_address_t { /* an address expressed as a (segment, address) pair */
+  ompd_seg_t segment; /* this header defines ompd_segment_none (0), presumably for this member -- confirm */
+  ompd_addr_t address;
+} ompd_address_t;
+
+typedef struct ompd_frame_info_t { /* pairs a frame's address with a flag word */
+  ompd_address_t frame_address; /* embedded by value: segment plus address */
+  ompd_word_t frame_flag; /* NOTE(review): the flag encoding is not visible in this chunk */
+} ompd_frame_info_t;
+
+typedef struct _ompd_aspace_handle ompd_address_space_handle_t; /* handle typedefs: the struct tags are not defined in this chunk, */
+typedef struct _ompd_thread_handle ompd_thread_handle_t; /* so within it the types are incomplete and usable only via pointers */
+typedef struct _ompd_parallel_handle ompd_parallel_handle_t;
+typedef struct _ompd_task_handle ompd_task_handle_t;
+
+typedef struct _ompd_aspace_cont ompd_address_space_context_t; /* context typedefs, declared the same way */
+typedef struct _ompd_thread_cont ompd_thread_context_t;
+
+typedef struct ompd_device_type_sizes_t { /* one uint8_t per basic C type; presumably that type's size on the device -- confirm */
+  uint8_t sizeof_char;
+  uint8_t sizeof_short;
+  uint8_t sizeof_int;
+  uint8_t sizeof_long;
+  uint8_t sizeof_long_long;
+  uint8_t sizeof_pointer; /* NOTE(review): the unit (bytes vs. bits) is not stated here -- confirm */
+} ompd_device_type_sizes_t;
+
+typedef struct ompt_record_ompt_t { /* one trace record: common header fields followed by a per-event union */
+  ompt_callbacks_t type; /* presumably selects which `record` member is valid -- confirm */
+  ompt_device_time_t time;
+  ompt_id_t thread_id; /* header-level ids shared by every record kind */
+  ompt_id_t target_id;
+  union { /* exactly one payload; members overlap in storage */
+    ompt_record_thread_begin_t thread_begin; /* the first five payload types are declared before this chunk */
+    ompt_record_parallel_begin_t parallel_begin;
+    ompt_record_parallel_end_t parallel_end;
+    ompt_record_work_t work;
+    ompt_record_dispatch_t dispatch;
+    ompt_record_task_create_t task_create;
+    ompt_record_dependences_t dependences;
+    ompt_record_task_dependence_t task_dependence;
+    ompt_record_task_schedule_t task_schedule;
+    ompt_record_implicit_task_t implicit_task;
+    ompt_record_master_t master;
+    ompt_record_sync_region_t sync_region;
+    ompt_record_mutex_acquire_t mutex_acquire;
+    ompt_record_mutex_t mutex;
+    ompt_record_nest_lock_t nest_lock;
+    ompt_record_flush_t flush;
+    ompt_record_cancel_t cancel;
+    ompt_record_target_t target;
+    ompt_record_target_data_op_t target_data_op;
+    ompt_record_target_map_t target_map;
+    ompt_record_target_kernel_t target_kernel;
+    ompt_record_control_tool_t control_tool;
+  } record;
+} ompt_record_ompt_t;
+
+typedef ompt_record_ompt_t *(*ompt_get_record_ompt_t) ( /* function-pointer type returning a pointer to an ompt_record_ompt_t */
+  ompt_buffer_t *buffer,
+  ompt_buffer_cursor_t current /* presumably a position within buffer -- confirm */
+);
+
+#define ompt_id_none 0 /* "none" sentinels: every one is 0 except ompt_data_none and ompt_addr_none */
+#define ompt_data_none {0} /* a brace initializer, so only valid where an initializer is allowed */
+#define ompt_time_none 0
+#define ompt_hwid_none 0
+#define ompt_addr_none ~0 /* all bits set, of type int; expands without parentheses */
+#define ompt_mutex_impl_none 0
+#define ompt_wait_id_none 0
+
+#define ompd_segment_none 0
+
+#endif /* __OMPT__ */
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/channel_descriptor.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/channel_descriptor.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6f039db8effce996015f901562009ebe976d832
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/channel_descriptor.h
@@ -0,0 +1,588 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CHANNEL_DESCRIPTOR_H__)
+#define __CHANNEL_DESCRIPTOR_H__
+
+#if defined(__cplusplus)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/**
+ * \addtogroup CUDART_HIGHLEVEL
+ *
+ * @{
+ */
+
+/**
+ * \brief \hl Returns a channel descriptor using the specified format
+ *
+ * Returns a channel descriptor with format \p f and number of bits of each
+ * component \p x, \p y, \p z, and \p w.  The ::cudaChannelFormatDesc is
+ * defined as:
+ * \code
+  struct cudaChannelFormatDesc {
+    int x, y, z, w;
+    enum cudaChannelFormatKind f;
+  };
+ * \endcode
+ *
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, ::cudaChannelFormatKindFloat,
+ * ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2,
+ * ::cudaChannelFormatKindSignedNormalized8X4,
+ * ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2,
+ * ::cudaChannelFormatKindUnsignedNormalized8X4,
+ * ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2,
+ * ::cudaChannelFormatKindSignedNormalized16X4,
+ * ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2,
+ * ::cudaChannelFormatKindUnsignedNormalized16X4
+ * or ::cudaChannelFormatKindNV12.
+ *
+ * The format is specified by the template specialization.
+ *
+ * The template function specializes for the following scalar types:
+ * char, signed char, unsigned char, short, unsigned short, int, unsigned int, float, and (only when __LP64__ is not defined) long, unsigned long.
+ * The template function specializes for the following vector types:
+ * char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, float{1|2|4}, and (only when __LP64__ is not defined) long{1|2|4}, ulong{1|2|4}.
+ * The template function specializes for the following cudaChannelFormatKind enum values:
+ * ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4}, ::cudaChannelFormatKindNV12, and the block-compressed kinds for BC1 through BC7.
+ *
+ * Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone
+ *
+ * \return
+ * Channel descriptor with format \p f
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
+ * ::cudaGetChannelDesc
+ */
+template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
+{ /* primary template, used for any T without an explicit specialization: all four widths 0, kind None */
+  return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
+{
+  /* One float-kind component as wide as unsigned short (presumably the storage size of a half). */
+  const int bits = 8 * (int)sizeof(unsigned short);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
+{
+  /* Same descriptor as cudaCreateChannelDescHalf: x only. */
+  const int bits = 8 * (int)sizeof(unsigned short);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
+{
+  /* Two float-kind components (x, y), each as wide as unsigned short. */
+  const int bits = 8 * (int)sizeof(unsigned short);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
+{
+  /* Four float-kind components (x, y, z, w), each as wide as unsigned short. */
+  const int bits = 8 * (int)sizeof(unsigned short);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
+{
+  /* Plain char: signed unless _CHAR_UNSIGNED or __CHAR_UNSIGNED__ is defined. */
+  const int bits = 8 * (int)sizeof(char);
+#if !defined(_CHAR_UNSIGNED) && !defined(__CHAR_UNSIGNED__)
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+#endif /* !_CHAR_UNSIGNED && !__CHAR_UNSIGNED__ */
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
+{
+  /* signed char: one signed component (x). */
+  const int bits = 8 * (int)sizeof(signed char);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
+{
+  /* unsigned char: one unsigned component (x). */
+  const int bits = 8 * (int)sizeof(unsigned char);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
+{
+  /* char1: one signed component, sized from signed char. */
+  const int bits = 8 * (int)sizeof(signed char);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
+{
+  /* uchar1: one unsigned component, sized from unsigned char. */
+  const int bits = 8 * (int)sizeof(unsigned char);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
+{
+  /* char2: two signed components (x, y). */
+  const int bits = 8 * (int)sizeof(signed char);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
+{
+  /* uchar2: two unsigned components (x, y). */
+  const int bits = 8 * (int)sizeof(unsigned char);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
+{
+  /* char4: four signed components (x, y, z, w). */
+  const int bits = 8 * (int)sizeof(signed char);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
+{
+  /* uchar4: four unsigned components (x, y, z, w). */
+  const int bits = 8 * (int)sizeof(unsigned char);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
+{
+  /* short: one signed component (x). */
+  const int bits = 8 * (int)sizeof(short);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
+{
+  /* unsigned short: one unsigned component (x). */
+  const int bits = 8 * (int)sizeof(unsigned short);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
+{
+  /* short1: one signed component, sized from short. */
+  const int bits = 8 * (int)sizeof(short);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
+{
+  /* ushort1: one unsigned component, sized from unsigned short. */
+  const int bits = 8 * (int)sizeof(unsigned short);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
+{
+  /* short2: two signed components (x, y). */
+  const int bits = 8 * (int)sizeof(short);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
+{
+  /* ushort2: two unsigned components (x, y). */
+  const int bits = 8 * (int)sizeof(unsigned short);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
+{
+  /* short4: four signed components (x, y, z, w). */
+  const int bits = 8 * (int)sizeof(short);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
+{
+  /* ushort4: four unsigned components (x, y, z, w). */
+  const int bits = 8 * (int)sizeof(unsigned short);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
+{
+  /* int: one signed component (x). */
+  const int bits = 8 * (int)sizeof(int);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
+{
+  /* unsigned int: one unsigned component (x). */
+  const int bits = 8 * (int)sizeof(unsigned int);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
+{
+  /* int1: one signed component, sized from int. */
+  const int bits = 8 * (int)sizeof(int);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
+{
+  /* uint1: one unsigned component, sized from unsigned int. */
+  const int bits = 8 * (int)sizeof(unsigned int);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
+{
+  /* int2: two signed components (x, y). */
+  const int bits = 8 * (int)sizeof(int);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
+{
+  /* uint2: two unsigned components (x, y). */
+  const int bits = 8 * (int)sizeof(unsigned int);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
+{
+  /* int4: four signed components (x, y, z, w). */
+  const int bits = 8 * (int)sizeof(int);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
+{
+  /* uint4: four unsigned components (x, y, z, w). */
+  const int bits = 8 * (int)sizeof(unsigned int);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
+}
+
+#if !defined(__LP64__)
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
+{
+  /* long (compiled only when __LP64__ is not defined): one signed component (x). */
+  const int bits = 8 * (int)sizeof(long);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
+{
+  /* unsigned long: one unsigned component (x). */
+  const int bits = 8 * (int)sizeof(unsigned long);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
+{
+  /* long1: one signed component, sized from long. */
+  const int bits = 8 * (int)sizeof(long);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
+{
+  /* ulong1: one unsigned component, sized from unsigned long. */
+  const int bits = 8 * (int)sizeof(unsigned long);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
+{
+  /* long2: two signed components (x, y). */
+  const int bits = 8 * (int)sizeof(long);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
+{
+  /* ulong2: two unsigned components (x, y). */
+  const int bits = 8 * (int)sizeof(unsigned long);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
+{
+  /* long4: four signed components (x, y, z, w). */
+  const int bits = 8 * (int)sizeof(long);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
+{
+  /* ulong4: four unsigned components (x, y, z, w). */
+  const int bits = 8 * (int)sizeof(unsigned long);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
+}
+
+#endif /* !__LP64__ */
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
+{
+  /* float: one float-kind component (x). */
+  const int bits = 8 * (int)sizeof(float);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
+{
+  /* float1: one float-kind component, sized from float. */
+  const int bits = 8 * (int)sizeof(float);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
+{
+  /* float2: two float-kind components (x, y). */
+  const int bits = 8 * (int)sizeof(float);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
+{
+  /* float4: four float-kind components (x, y, z, w). */
+  const int bits = 8 * (int)sizeof(float);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void)
+{
+    /* NV12 kind: x, y and z each as wide as a char; w is 0. */
+    const int bits = 8 * (int)sizeof(char);
+    return cudaCreateChannelDesc(bits, bits, bits, 0, cudaChannelFormatKindNV12);
+}
+
+template<cudaChannelFormatKind> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
+{ /* primary template over an enum value, used for kinds without a specialization: all widths 0, kind None */
+    return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
+}
+
+/* Signed 8-bit normalized integer formats: X1/X2/X4 report 8 bits in the first 1, 2 or 4 components */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X1>(void)
+{ /* x = 8 bits; y, z, w = 0 */
+    return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X2>(void)
+{ /* x, y = 8 bits; z, w = 0 */
+    return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X4>(void)
+{ /* x, y, z, w = 8 bits */
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4);
+}
+
+/* Unsigned 8-bit normalized integer formats: X1/X2/X4 report 8 bits in the first 1, 2 or 4 components */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X1>(void)
+{ /* x = 8 bits; y, z, w = 0 */
+    return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X2>(void)
+{ /* x, y = 8 bits; z, w = 0 */
+    return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X4>(void)
+{ /* x, y, z, w = 8 bits */
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4);
+}
+
+/* Signed 16-bit normalized integer formats: X1/X2/X4 report 16 bits in the first 1, 2 or 4 components */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X1>(void)
+{ /* x = 16 bits; y, z, w = 0 */
+    return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X2>(void)
+{ /* x, y = 16 bits; z, w = 0 */
+    return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X4>(void)
+{ /* x, y, z, w = 16 bits */
+    return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4);
+}
+
+/* Unsigned 16-bit normalized integer formats: X1/X2/X4 report 16 bits in the first 1, 2 or 4 components */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X1>(void)
+{ /* x = 16 bits; y, z, w = 0 */
+    return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X2>(void)
+{ /* x, y = 16 bits; z, w = 0 */
+    return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X4>(void)
+{ /* x, y, z, w = 16 bits */
+    return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4);
+}
+
+/* NV12 format: enum-value counterpart of cudaCreateChannelDescNV12, with the widths written as literals */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindNV12>(void)
+{ /* x, y, z = 8 bits; w = 0 */
+    return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12);
+}
+
+/* BC1 format: reported as four 8-bit components */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1>(void)
+{ /* x, y, z, w = 8 bits */
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1);
+}
+
+/* BC1sRGB format: same widths as BC1, SRGB kind */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1SRGB>(void)
+{ /* x, y, z, w = 8 bits */
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB);
+}
+
+/* BC2 format: reported as four 8-bit components */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2>(void)
+{ /* x, y, z, w = 8 bits */
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2);
+}
+
+/* BC2sRGB format: same widths as BC2, SRGB kind */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2SRGB>(void)
+{ /* x, y, z, w = 8 bits */
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB);
+}
+
+/* BC3 format: reported as four 8-bit components */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3>(void)
+{ /* x, y, z, w = 8 bits */
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3);
+}
+
+/* BC3sRGB format: same widths as BC3, SRGB kind */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3SRGB>(void)
+{ /* x, y, z, w = 8 bits */
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB);
+}
+
+/* BC4 unsigned format: reported as a single 8-bit component */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>(void)
+{ /* x = 8 bits; y, z, w = 0 */
+    return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4);
+}
+
+/* BC4 signed format: same widths as BC4 unsigned, signed kind */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed4>(void)
+{ /* x = 8 bits; y, z, w = 0 */
+    return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4);
+}
+
+/* BC5 unsigned format: reported as two 8-bit components */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed5>(void)
+{ /* x, y = 8 bits; z, w = 0 */
+    return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5);
+}
+
+/* BC5 signed format: same widths as BC5 unsigned, signed kind */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed5>(void)
+{ /* x, y = 8 bits; z, w = 0 */
+    return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5);
+}
+
+/* BC6H unsigned format: reported as three 16-bit components */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed6H>(void)
+{ /* x, y, z = 16 bits; w = 0 */
+    return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H);
+}
+
+/* BC6H signed format: same widths as BC6H unsigned, signed kind */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed6H>(void)
+{ /* x, y, z = 16 bits; w = 0 */
+    return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H);
+}
+
+/* BC7 format: reported as four 8-bit components */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>(void)
+{ /* x, y, z, w = 8 bits */
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7);
+}
+
+/* BC7sRGB format: same widths as BC7, SRGB kind */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7SRGB>(void)
+{ /* x, y, z, w = 8 bits */
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB);
+}
+
+#endif /* __cplusplus */
+
+/** @} */
+/** @} */ /* END CUDART_TEXTURE_HL */
+
+#endif /* !__CHANNEL_DESCRIPTOR_H__ */
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups.h
new file mode 100644
index 0000000000000000000000000000000000000000..e04314301c1cca9e2514f56367e1dbc45cfdad69
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups.h
@@ -0,0 +1,1730 @@
+/*
+ * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _COOPERATIVE_GROUPS_H_
+#define _COOPERATIVE_GROUPS_H_
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#include "cooperative_groups/details/info.h"
+#include "cooperative_groups/details/driver_abi.h"
+#include "cooperative_groups/details/helpers.h"
+#include "cooperative_groups/details/memory.h"
+
+/*
+ * _CG_THREAD_SCOPE(scope): when _CG_HAS_STL_ATOMICS is defined, expands to the
+ * declaration of a `thread_scope` constant initialised with `scope`. The
+ * trailing ';' is part of the macro, so uses are written without one.
+ * Otherwise it expands to nothing.
+ */
+#if defined(_CG_HAS_STL_ATOMICS)
+#include <cuda/atomic>
+#define _CG_THREAD_SCOPE(scope) _CG_STATIC_CONST_DECL cuda::thread_scope thread_scope = scope;
+#else
+#define _CG_THREAD_SCOPE(scope)
+#endif
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+    // Numeric tags identifying the concrete group kind. Group classes pass one
+    // of these as the TyId of thread_group_base, whose constructor stores it in
+    // the 7-bit `type` field of thread_group.
+    _CG_CONST_DECL unsigned int coalesced_group_id = 1;
+    _CG_CONST_DECL unsigned int multi_grid_group_id = 2;
+    _CG_CONST_DECL unsigned int grid_group_id = 3;
+    _CG_CONST_DECL unsigned int thread_block_id = 4;
+    _CG_CONST_DECL unsigned int multi_tile_group_id = 5;
+    _CG_CONST_DECL unsigned int cluster_group_id = 6;
+}
+
+/**
+ * class thread_group;
+ *
+ * Generic thread group type, into which all groups are convertible.
+ * It acts as a container for all storage necessary for the derived groups,
+ * and will dispatch the API calls to the correct derived group. This means
+ * that all derived groups must implement the same interface as thread_group.
+ */
+class thread_group
+{
+protected:
+    // Header view of the storage: one unused bit followed by the 7-bit type
+    // tag. The unnamed zero-width bitfield terminates the bitfield run.
+    struct group_data {
+        unsigned int _unused : 1;
+        unsigned int type : 7, : 0;
+    };
+
+    // Grid-group view: a single pointer to the grid workspace.
+    // NOTE(review): this view has no _unused/type fields, so writing gridWs
+    // overlays the bytes that hold group.type -- confirm get_type() is not
+    // relied upon for grid groups.
+    struct gg_data  {
+        details::grid_workspace *gridWs;
+    };
+
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    // Multi-grid view (experimental ABI only): the same 1+7 bit header, a
+    // 56-bit handle, and a pointer to the function table used by the group.
+    struct mg_data  {
+        unsigned long long _unused : 1;
+        unsigned long long type    : 7;
+        unsigned long long handle  : 56;
+        const details::multi_grid::multi_grid_functions *functions;
+    };
+#endif
+
+    // Coalesced/tiled view: tiled flag, type tag and size share the first
+    // word; meta-group size/rank share the second; then the lane mask and a
+    // reserved word.
+    struct tg_data {
+        unsigned int is_tiled : 1;
+        unsigned int type : 7;
+        unsigned int size : 24;
+        // packed to 4b
+        unsigned int metaGroupSize : 16;
+        unsigned int metaGroupRank : 16;
+        // packed to 8b
+        unsigned int mask;
+        // packed to 12b
+        unsigned int _res;
+    };
+
+    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
+    friend class thread_block;
+
+    // All views share the same 8-byte-aligned storage; which view is valid
+    // depends on the concrete group kind.
+    union __align__(8) {
+        group_data  group;
+        tg_data     coalesced;
+        gg_data     grid;
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+        mg_data     multi_grid;
+#endif
+    } _data;
+
+    // Assignment is declared here (protected) without an in-class body.
+    _CG_QUALIFIER thread_group operator=(const thread_group& src);
+
+    // Records the group kind and clears the unused header bit.
+    _CG_QUALIFIER thread_group(unsigned int type) {
+        _data.group.type = type;
+        _data.group._unused = false;
+    }
+
+#ifdef _CG_CPP11_FEATURES
+    // Every view must fit in 16 bytes.
+    static_assert(sizeof(tg_data) <= 16, "Failed size check");
+    static_assert(sizeof(gg_data) <= 16, "Failed size check");
+#  ifdef _CG_ABI_EXPERIMENTAL
+    static_assert(sizeof(mg_data) <= 16, "Failed size check");
+#  endif
+#endif
+
+public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
+
+    // Generic interface; these four are only declared in this class body.
+    _CG_QUALIFIER unsigned long long size() const;
+    _CG_QUALIFIER unsigned long long num_threads() const;
+    _CG_QUALIFIER unsigned long long thread_rank() const;
+    _CG_QUALIFIER void sync() const;
+    // Returns the type tag stored in the header view.
+    _CG_QUALIFIER unsigned int get_type() const {
+        return _data.group.type;
+    }
+
+};
+
+// Thin typed layer over thread_group: fixes the group-kind tag at compile time.
+template <unsigned int TyId>
+struct thread_group_base : public thread_group {
+    // Compile-time copy of the tag handed to the thread_group constructor.
+    _CG_STATIC_CONST_DECL unsigned int id = TyId;
+
+    _CG_QUALIFIER thread_group_base() : thread_group(TyId) {}
+};
+
+#if defined(_CG_HAS_MULTI_GRID_GROUP)
+
+/**
+ * class multi_grid_group;
+ *
+ * Threads within this group are guaranteed to be co-resident on the
+ * same system, on multiple devices within the same launched kernels.
+ * To use this group, the kernel must have been launched with
+ * cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent),
+ * and the device must support it (queryable device attribute).
+ *
+ * Constructed via this_multi_grid();
+ */
+
+
+# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+class multi_grid_group;
+
+// Multi grid group requires these functions to be templated to prevent ptxas from trying to use CG syscalls
+template <typename = void>
+__device__ _CG_DEPRECATED multi_grid_group this_multi_grid();
+
+class multi_grid_group : public thread_group_base<details::multi_grid_group_id>
+{
+private:
+    // Loads the multi-grid function table, then obtains the handle through it.
+    // Private: only the befriended this_multi_grid<void>() can construct one.
+    template <typename = void>
+    _CG_QUALIFIER multi_grid_group() {
+        _data.multi_grid.functions = details::multi_grid::load_grid_intrinsics();
+        _data.multi_grid.handle = _data.multi_grid.functions->get_intrinsic_handle();
+    }
+
+    friend multi_grid_group this_multi_grid<void>();
+
+public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
+
+    // A group is valid when its handle is non-zero.
+    _CG_QUALIFIER bool is_valid() const {
+        return (_data.multi_grid.handle != 0);
+    }
+
+    // Aborts on an invalid group; otherwise synchronizes via the function table.
+    _CG_QUALIFIER void sync() const {
+        if (!is_valid()) {
+            _CG_ABORT();
+        }
+        _data.multi_grid.functions->sync(_data.multi_grid.handle);
+    }
+
+    // The queries below assert validity and forward to the function table.
+    _CG_QUALIFIER unsigned long long num_threads() const {
+        _CG_ASSERT(is_valid());
+        return _data.multi_grid.functions->size(_data.multi_grid.handle);
+    }
+
+    // Same value as num_threads().
+    _CG_QUALIFIER unsigned long long size() const {
+        return num_threads();
+    }
+
+    _CG_QUALIFIER unsigned long long thread_rank() const {
+        _CG_ASSERT(is_valid());
+        return _data.multi_grid.functions->thread_rank(_data.multi_grid.handle);
+    }
+
+    _CG_QUALIFIER unsigned int grid_rank() const {
+        _CG_ASSERT(is_valid());
+        return (_data.multi_grid.functions->grid_rank(_data.multi_grid.handle));
+    }
+
+    _CG_QUALIFIER unsigned int num_grids() const {
+        _CG_ASSERT(is_valid());
+        return (_data.multi_grid.functions->num_grids(_data.multi_grid.handle));
+    }
+};
+# else
+// Non-experimental-ABI variant: a standalone class (not derived from
+// thread_group) that caches the handle, size and rank at construction.
+class multi_grid_group
+{
+private:
+    unsigned long long _handle;
+    unsigned int _size;
+    unsigned int _rank;
+
+    friend _CG_QUALIFIER multi_grid_group this_multi_grid();
+
+    // Fetches the handle first; size and rank are then queried with it.
+    _CG_QUALIFIER multi_grid_group() {
+        _handle = details::multi_grid::get_intrinsic_handle();
+        _size = details::multi_grid::size(_handle);
+        _rank = details::multi_grid::thread_rank(_handle);
+    }
+
+public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
+
+    // A group is valid when its handle is non-zero.
+    _CG_QUALIFIER _CG_DEPRECATED bool is_valid() const {
+        return (_handle != 0);
+    }
+
+    // Aborts on an invalid group; otherwise synchronizes on the handle.
+    _CG_QUALIFIER _CG_DEPRECATED void sync() const {
+        if (!is_valid()) {
+            _CG_ABORT();
+        }
+        details::multi_grid::sync(_handle);
+    }
+
+    // Returns the size cached by the constructor.
+    _CG_QUALIFIER _CG_DEPRECATED unsigned long long num_threads() const {
+        _CG_ASSERT(is_valid());
+        return _size;
+    }
+
+    // Same value as num_threads().
+    _CG_QUALIFIER _CG_DEPRECATED unsigned long long size() const {
+        return num_threads();
+    }
+
+    // Returns the rank cached by the constructor.
+    _CG_QUALIFIER _CG_DEPRECATED unsigned long long thread_rank() const {
+        _CG_ASSERT(is_valid());
+        return _rank;
+    }
+
+    // Unlike size/rank, the grid queries are evaluated on every call.
+    _CG_QUALIFIER _CG_DEPRECATED unsigned int grid_rank() const {
+        _CG_ASSERT(is_valid());
+        return (details::multi_grid::grid_rank(_handle));
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED unsigned int num_grids() const {
+        _CG_ASSERT(is_valid());
+        return (details::multi_grid::num_grids(_handle));
+    }
+};
+# endif
+
+/**
+ * multi_grid_group this_multi_grid()
+ *
+ * Constructs a multi_grid_group
+ */
+// The qualifiers depend on the ABI: with the experimental C++11 ABI this is a
+// __device__ function template (matching the templated forward declaration);
+// otherwise it is a plain _CG_QUALIFIER function. Deprecated in both forms.
+# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+template <typename>
+__device__
+#else
+_CG_QUALIFIER
+# endif
+_CG_DEPRECATED
+multi_grid_group this_multi_grid()
+{
+    return multi_grid_group();
+}
+#endif
+
+/**
+ * class grid_group;
+ *
+ * Threads within this group are guaranteed to be co-resident on the
+ * same device within the same launched kernel. To use this group, the kernel
+ * must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent),
+ * and the device must support it (queryable device attribute).
+ *
+ * Constructed via this_grid();
+ */
+class grid_group : public thread_group_base<details::grid_group_id>
+{
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::grid_group_id;
+    friend _CG_QUALIFIER grid_group this_grid();
+
+private:
+    // Stores the workspace pointer; only the befriended this_grid() can call this.
+    _CG_QUALIFIER grid_group(details::grid_workspace *gridWs) {
+        _data.grid.gridWs = gridWs;
+    }
+
+ public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
+
+    // A group is valid when it holds a non-null workspace pointer.
+    _CG_QUALIFIER bool is_valid() const {
+        return (_data.grid.gridWs != NULL);
+    }
+
+    // Aborts on an invalid group; otherwise synchronizes on the workspace barrier.
+    _CG_QUALIFIER void sync() const {
+        if (!is_valid()) {
+            _CG_ABORT();
+        }
+        details::grid::sync(&_data.grid.gridWs->barrier);
+    }
+
+#if defined(_CG_CPP11_FEATURES)
+    using arrival_token = unsigned int;
+
+    // Split barrier, arrive half: aborts on an invalid group, otherwise
+    // returns the token produced by the workspace barrier.
+    _CG_QUALIFIER arrival_token barrier_arrive() const {
+        if (!is_valid()) {
+            _CG_ABORT();
+        }
+        return details::grid::barrier_arrive(&_data.grid.gridWs->barrier);
+    }
+
+    // Split barrier, wait half.
+    // NOTE(review): unlike sync() and barrier_arrive(), there is no is_valid()
+    // check here before gridWs is dereferenced.
+    _CG_QUALIFIER void barrier_wait(arrival_token&& token) const {
+        details::grid::barrier_wait(token, &_data.grid.gridWs->barrier);
+    }
+#endif
+
+    // The remaining members are static queries that forward to details::grid
+    // and do not read the stored workspace pointer.
+    _CG_STATIC_QUALIFIER unsigned long long size() {
+        return details::grid::size();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 group_dim() {
+        return details::grid::grid_dim();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_threads() {
+        return details::grid::dim_threads();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long num_threads() {
+        return details::grid::num_threads();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 thread_index() {
+        return details::grid::thread_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long thread_rank() {
+        return details::grid::thread_rank();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_blocks() {
+        return details::grid::dim_blocks();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long num_blocks() {
+        return details::grid::num_blocks();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 block_index() {
+        return details::grid::block_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long block_rank() {
+        return details::grid::block_rank();
+    }
+
+# if defined(_CG_HAS_CLUSTER_GROUP)
+    // Cluster-level queries, available only when cluster groups are supported.
+    _CG_STATIC_QUALIFIER dim3 dim_clusters() {
+        return details::grid::dim_clusters();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long num_clusters() {
+        return details::grid::num_clusters();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 cluster_index() {
+        return details::grid::cluster_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
+        return details::grid::cluster_rank();
+    }
+# endif
+};
+
+// Builds the grid_group for the calling thread from the driver-provided workspace.
+_CG_QUALIFIER grid_group this_grid() {
+    grid_group grid(details::get_grid_workspace());
+#if defined(_CG_DEBUG)
+    // Debug builds synchronize immediately, which requires every thread of
+    // the grid to reach this point.
+    grid.sync();
+#endif // _CG_DEBUG
+    return grid;
+}
+
+#if defined(_CG_HAS_CLUSTER_GROUP)
+/**
+ * class cluster_group
+ *
+ * Every GPU kernel is executed by a grid of thread blocks. A grid can be evenly
+ * divided along all dimensions to form groups of blocks, each group of which is
+ * a block cluster. Clustered grids are subject to various restrictions and
+ * limitations. Primarily, a cluster consists of at most 8 blocks by default
+ * (although the user is allowed to opt-in to non-standard sizes,) and clustered
+ * grids are subject to additional occupancy limitations due to per-cluster
+ * hardware resource consumption. In exchange, a block cluster is guaranteed to
+ * be a cooperative group, with access to all cooperative group capabilities, as
+ * well as cluster specific capabilities and accelerations. A cluster_group
+ * represents a block cluster.
+ *
+ * Constructed via this_cluster();
+ */
+class cluster_group : public thread_group_base<details::cluster_group_id>
+{
+    // Friends
+    friend _CG_QUALIFIER cluster_group this_cluster();
+
+    // Disable constructor
+    // (private: only the befriended this_cluster() can create an instance)
+    _CG_QUALIFIER cluster_group()
+    {
+    }
+
+ public:
+    //_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_cluster)
+
+    // Empty token type: arrival carries no per-call state in this class.
+    using arrival_token = struct {};
+
+    // Functionality exposed by the group
+    // Every member below is static and forwards to details::cluster.
+    _CG_STATIC_QUALIFIER void sync()
+    {
+        return details::cluster::sync();
+    }
+
+    // Split barrier, arrive half; returns an empty token.
+    _CG_STATIC_QUALIFIER arrival_token barrier_arrive()
+    {
+        details::cluster::barrier_arrive();
+        return arrival_token();
+    }
+
+    // Split barrier, wait half; the token overload ignores its argument.
+    _CG_STATIC_QUALIFIER void barrier_wait()
+    {
+        return details::cluster::barrier_wait();
+    }
+
+    _CG_STATIC_QUALIFIER void barrier_wait(arrival_token&&)
+    {
+        return details::cluster::barrier_wait();
+    }
+
+    // Shared-memory address queries, forwarded unchanged to details::cluster.
+    _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
+    {
+        return details::cluster::query_shared_rank(addr);
+    }
+
+    template <typename T>
+    _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
+    {
+        return details::cluster::map_shared_rank(addr, rank);
+    }
+
+    // Index, rank and dimension queries.
+    _CG_STATIC_QUALIFIER dim3 block_index()
+    {
+        return details::cluster::block_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int block_rank()
+    {
+        return details::cluster::block_rank();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 thread_index()
+    {
+        return details::cluster::thread_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int thread_rank()
+    {
+        return details::cluster::thread_rank();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_blocks()
+    {
+        return details::cluster::dim_blocks();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int num_blocks()
+    {
+        return details::cluster::num_blocks();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_threads()
+    {
+        return details::cluster::dim_threads();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int num_threads()
+    {
+        return details::cluster::num_threads();
+    }
+
+    // Legacy aliases
+    _CG_STATIC_QUALIFIER unsigned int size()
+    {
+        return num_threads();
+    }
+};
+
+/*
+ * cluster_group this_cluster()
+ *
+ * Constructs a cluster_group
+ */
+_CG_QUALIFIER cluster_group this_cluster()
+{
+    cluster_group cluster;
+#if defined(_CG_DEBUG)
+    // Debug builds synchronize the cluster as soon as the group is created.
+    cluster.sync();
+#endif
+    return cluster;
+}
+#endif
+
+#if defined(_CG_CPP11_FEATURES)
+class thread_block;
+template <unsigned int MaxBlockSize>
+_CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
+#endif
+
+/**
+ * class thread_block
+ *
+ * Every GPU kernel is executed by a grid of thread blocks, and threads within
+ * each block are guaranteed to reside on the same streaming multiprocessor.
+ * A thread_block represents a thread block whose dimensions are not known until runtime.
+ *
+ * Constructed via this_thread_block();
+ */
+class thread_block : public thread_group_base<details::thread_block_id>
+{
+    // Friends
+    friend _CG_QUALIFIER thread_block this_thread_block();
+    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
+    friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);
+
+#if defined(_CG_CPP11_FEATURES)
+    template <unsigned int MaxBlockSize>
+    friend _CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
+    template <unsigned int Size>
+    friend class __static_size_multi_warp_tile_base;
+
+    // Scratch area for multi-warp tiles; fixed for the lifetime of the object.
+    details::multi_warp_scratch* const tile_memory;
+
+    // Builds a block group backed by caller-provided scratch memory.
+    // In debug builds, aborts if the block has more threads than MaxBlockSize.
+    // Without reserved shared memory, the barriers in the scratch area are
+    // initialised here, followed by a block-wide sync.
+    template <unsigned int MaxBlockSize>
+    _CG_QUALIFIER thread_block(block_tile_memory<MaxBlockSize>& scratch) :
+        tile_memory(details::get_scratch_ptr(&scratch)) {
+#ifdef _CG_DEBUG
+        if (num_threads() > MaxBlockSize) {
+            details::abort();
+        }
+#endif
+#if !defined(_CG_HAS_RESERVED_SHARED)
+        tile_memory->init_barriers(thread_rank());
+        sync();
+#endif
+    }
+#endif
+
+    // Disable constructor
+    // (private: reachable only through the friends declared above)
+    _CG_QUALIFIER thread_block()
+#if defined(_CG_CPP11_FEATURES)
+    : tile_memory(details::get_scratch_ptr(NULL))
+#endif
+    { }
+
+    // Internal Use
+    // Builds the tile of `tilesz` threads containing the calling thread and
+    // returns it as a coalesced-type thread_group. tilesz must be a power of
+    // two in [1, 32]; anything else aborts.
+    _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const {
+        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
+
+        // Invalid, immediately fail
+        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
+            details::abort();
+            return (thread_block());
+        }
+
+        unsigned int mask;
+        // Block rank of the first thread in this thread's tile.
+        unsigned int base_offset = thread_rank() & (~(tilesz - 1));
+        // Tile width, clipped so the last tile does not run past the block.
+        unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
+
+        // masklength low bits set, then shifted to the tile's first lane.
+        mask = (unsigned int)(-1) >> (32 - masklength);
+        mask <<= (details::laneid() & ~(tilesz - 1));
+        thread_group tile = thread_group(details::coalesced_group_id);
+        tile._data.coalesced.mask = mask;
+        tile._data.coalesced.size = __popc(mask);
+        // Tile count is rounded up so a trailing partial tile is included.
+        tile._data.coalesced.metaGroupSize = (details::cta::size() + tilesz - 1) / tilesz;
+        tile._data.coalesced.metaGroupRank = details::cta::thread_rank() / tilesz;
+        tile._data.coalesced.is_tiled = true;
+        return (tile);
+    }
+
+ public:
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::thread_block_id;
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
+
+    // Block-wide synchronization, forwarded to details::cta.
+    _CG_STATIC_QUALIFIER void sync() {
+        details::cta::sync();
+    }
+
+#if defined(_CG_CPP11_FEATURES)
+    struct arrival_token {};
+
+    // Arrive does no work here: it only hands back an empty token.
+    _CG_QUALIFIER arrival_token barrier_arrive() const {
+        return arrival_token();
+    }
+
+    // The whole synchronization is performed on the wait side.
+    _CG_QUALIFIER void barrier_wait(arrival_token&&) const {
+        details::cta::sync();
+    }
+#endif
+
+    // Static queries forwarded to details::cta.
+    _CG_STATIC_QUALIFIER unsigned int size() {
+        return details::cta::size();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int thread_rank() {
+        return details::cta::thread_rank();
+    }
+
+    // Additional functionality exposed by the group
+    _CG_STATIC_QUALIFIER dim3 group_index() {
+        return details::cta::group_index();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 thread_index() {
+        return details::cta::thread_index();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 group_dim() {
+        return details::cta::block_dim();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_threads() {
+        return details::cta::dim_threads();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int num_threads() {
+        return details::cta::num_threads();
+    }
+
+};
+
+/**
+ * thread_block this_thread_block()
+ *
+ * Constructs a thread_block group
+ */
+_CG_QUALIFIER thread_block this_thread_block()
+{
+    // Default-constructed group: no caller-provided scratch memory.
+    thread_block block;
+    return block;
+}
+
+#if defined(_CG_CPP11_FEATURES)
+// Overload that backs the block group with caller-provided tile scratch memory.
+template <unsigned int MaxBlockSize>
+_CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch) {
+    thread_block block(scratch);
+    return block;
+}
+#endif
+
+/**
+ * class coalesced_group
+ *
+ * A group representing the current set of converged threads in a warp.
+ * The size of the group is not guaranteed and it may return a group of
+ * only one thread (itself).
+ *
+ * This group exposes warp-synchronous builtins.
+ * Constructed via coalesced_threads();
+ */
+class coalesced_group : public thread_group_base<details::coalesced_group_id>
+{
+private:
+    friend _CG_QUALIFIER coalesced_group coalesced_threads();
+    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
+    friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz);
+    friend class details::_coalesced_group_data_access;
+
+    // Re-indexes a warp-lane bitmask by group rank: bit r of the result is set
+    // when the r-th member lane of this group (in ascending lane order) is set
+    // in laneMask. Lanes outside the group are ignored.
+    _CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const {
+        unsigned int member_pack = 0;
+        unsigned int member_rank = 0;
+        for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
+            // Unsigned literal: the shift reaches bit 31, which a signed 1
+            // cannot represent portably.
+            unsigned int lane_bit = _data.coalesced.mask & (1u << bit_idx);
+            if (lane_bit) {
+                if (laneMask & lane_bit)
+                    member_pack |= 1u << member_rank;
+                member_rank++;
+            }
+        }
+        return (member_pack);
+    }
+
+    // Internal Use
+    // Returns the sub-group of at most `tilesz` threads that contains the
+    // calling thread. tilesz must be a power of two in [1, 32]; anything else
+    // aborts. A group that is already no larger than tilesz is returned as is.
+    _CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const {
+        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
+
+        // Invalid, immediately fail
+        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
+            details::abort();
+            return (coalesced_group(0));
+        }
+        if (size() <= tilesz) {
+            return (*this);
+        }
+
+        // Number of tiles produced, rounded up so that a trailing partial tile
+        // is counted. Both branches below share it, which keeps
+        // metaGroupRank < metaGroupSize for every thread.
+        const unsigned int tile_count = (size() + tilesz - 1) / tilesz;
+
+        if (_data.coalesced.is_tiled) {
+            // Parent lanes are contiguous: build the tile mask arithmetically.
+            unsigned int base_offset = (thread_rank() & (~(tilesz - 1)));
+            unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
+            unsigned int mask = (unsigned int)(-1) >> (32 - masklength);
+
+            mask <<= (details::laneid() & ~(tilesz - 1));
+            coalesced_group coalesced_tile = coalesced_group(mask);
+            coalesced_tile._data.coalesced.metaGroupSize = tile_count;
+            coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
+            coalesced_tile._data.coalesced.is_tiled = true;
+            return (coalesced_tile);
+        }
+
+        // Arbitrary parent mask: skip the member lanes that belong to earlier
+        // tiles, then collect the next `tilesz` member lanes.
+        unsigned int mask = 0;
+        unsigned int member_rank = 0;
+        int seen_lanes = (thread_rank() / tilesz) * tilesz;
+        for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
+            unsigned int lane_bit = _data.coalesced.mask & (1u << bit_idx);
+            if (lane_bit) {
+                if (seen_lanes <= 0 && member_rank < tilesz) {
+                    mask |= lane_bit;
+                    member_rank++;
+                }
+                seen_lanes--;
+            }
+        }
+        coalesced_group coalesced_tile = coalesced_group(mask);
+        // Override parent with the size of this group
+        coalesced_tile._data.coalesced.metaGroupSize = tile_count;
+        coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
+        return coalesced_tile;
+    }
+
+ protected:
+    // Builds a non-tiled group from a lane mask: the size is the mask's
+    // population count and the group is its own single meta-group.
+    _CG_QUALIFIER coalesced_group(unsigned int mask) {
+        _data.coalesced.mask = mask;
+        _data.coalesced.size = __popc(mask);
+        _data.coalesced.metaGroupRank = 0;
+        _data.coalesced.metaGroupSize = 1;
+        _data.coalesced.is_tiled = false;
+    }
+
+    _CG_QUALIFIER unsigned int get_mask() const {
+        return (_data.coalesced.mask);
+    }
+
+ public:
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
+
+    // Number of member threads (population count of the lane mask).
+    _CG_QUALIFIER unsigned int num_threads() const {
+        return _data.coalesced.size;
+    }
+
+    // Legacy spelling of num_threads().
+    _CG_QUALIFIER unsigned int size() const {
+        return num_threads();
+    }
+
+    // Rank within the group: the number of member lanes below the caller's lane.
+    _CG_QUALIFIER unsigned int thread_rank() const {
+        return (__popc(_data.coalesced.mask & details::lanemask32_lt()));
+    }
+
+    // Rank of this group in the upper level of the hierarchy
+    _CG_QUALIFIER unsigned int meta_group_rank() const {
+        return _data.coalesced.metaGroupRank;
+    }
+
+    // Total num partitions created out of all CTAs when the group was created
+    _CG_QUALIFIER unsigned int meta_group_size() const {
+        return _data.coalesced.metaGroupSize;
+    }
+
+    // Warp-level synchronization restricted to the member lanes.
+    _CG_QUALIFIER void sync() const {
+        __syncwarp(_data.coalesced.mask);
+    }
+
+#ifdef _CG_CPP11_FEATURES
+    // Reads `elem` from the member with group rank srcRank. The rank is
+    // translated to a warp lane: the lowest member lane for rank 0, the rank
+    // itself for a full warp, and a set-bit search in the mask otherwise.
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
+        unsigned int lane = (srcRank == 0) ? __ffs(_data.coalesced.mask) - 1 :
+            (size() == 32) ? srcRank : __fns(_data.coalesced.mask, 0, (srcRank + 1));
+
+        return details::tile::shuffle_dispatch<TyElem>::shfl(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
+    }
+
+    // Reads `elem` from the member `delta` ranks above the caller. A full warp
+    // uses the native shuffle; otherwise the source lane is searched for in
+    // the mask, and the caller reads its own value when no such member exists.
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
+        if (size() == 32) {
+            return details::tile::shuffle_dispatch<TyElem>::shfl_down(
+                _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
+        }
+
+        unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
+
+        if (lane >= 32)
+            lane = details::laneid();
+
+        return details::tile::shuffle_dispatch<TyElem>::shfl(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
+    }
+
+    // Mirror of shfl_down: reads from the member `delta` ranks below the caller.
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, int delta) const {
+        if (size() == 32) {
+            return details::tile::shuffle_dispatch<TyElem>::shfl_up(
+                _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
+        }
+
+        unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
+        if (lane >= 32)
+            lane = details::laneid();
+
+        return details::tile::shuffle_dispatch<TyElem>::shfl(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
+    }
+#else
+    // Pre-C++11 variants: same lane selection as above, applied to arithmetic
+    // types through the raw __shfl*_sync intrinsics.
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl(TyIntegral var, unsigned int src_rank) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 :
+            (size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1));
+        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, int delta) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        if (size() == 32) {
+            return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32));
+        }
+        unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
+        if (lane >= 32) lane = details::laneid();
+        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, int delta) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        if (size() == 32) {
+            return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32));
+        }
+        unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
+        if (lane >= 32) lane = details::laneid();
+        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
+    }
+#endif
+
+    // Non-zero when the predicate holds for at least one member.
+    _CG_QUALIFIER int any(int predicate) const {
+        return (__ballot_sync(_data.coalesced.mask, predicate) != 0);
+    }
+    // Non-zero when the predicate holds for every member.
+    _CG_QUALIFIER int all(int predicate) const {
+        return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask);
+    }
+    // Predicate votes indexed by group rank. For a full warp, lane and rank
+    // coincide, so the raw ballot is returned without repacking.
+    _CG_QUALIFIER unsigned int ballot(int predicate) const {
+        if (size() == 32) {
+            return (__ballot_sync(0xFFFFFFFF, predicate));
+        }
+        unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate);
+        return (_packLanes(lane_ballot));
+    }
+
+#ifdef _CG_HAS_MATCH_COLLECTIVE
+
+    // Match collectives: the result mask is indexed by group rank, repacked
+    // from lane order unless the group is a full warp.
+    template <typename TyIntegral>
+    _CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        if (size() == 32) {
+            return (__match_any_sync(0xFFFFFFFF, val));
+        }
+        unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val);
+        return (_packLanes(lane_match));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        if (size() == 32) {
+            return (__match_all_sync(0xFFFFFFFF, val, &pred));
+        }
+        unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred);
+        return (_packLanes(lane_match));
+    }
+
+#endif /* _CG_HAS_MATCH_COLLECTIVE */
+
+};
+
+// Builds a coalesced_group from the mask reported by __activemask() at the call site.
+_CG_QUALIFIER coalesced_group coalesced_threads()
+{
+    const unsigned int activeLanes = __activemask();
+    return coalesced_group(activeLanes);
+}
+
+namespace details {
+    // Pre-C++11 size check: only the specializations listed here define the
+    // nested `OK` typedef, so naming `::OK` for any other Size fails to compile.
+    template <unsigned int Size> struct verify_thread_block_tile_size;
+    template <> struct verify_thread_block_tile_size<32> { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<16> { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<8>  { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<4>  { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<2>  { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<1>  { typedef void OK; };
+
+#ifdef _CG_CPP11_FEATURES
+    // True when Size has at most one bit set (note: this also holds for 0).
+    template <unsigned int Size>
+    using _is_power_of_2 = _CG_STL_NAMESPACE::integral_constant<bool, (Size & (Size - 1)) == 0>;
+
+    // Size fits in one warp (<= 32) ...
+    template <unsigned int Size>
+    using _is_single_warp = _CG_STL_NAMESPACE::integral_constant<bool, Size <= 32>;
+    // ... or spans several warps (33..1024).
+    template <unsigned int Size>
+    using _is_multi_warp =
+    _CG_STL_NAMESPACE::integral_constant<bool, (Size > 32) && (Size <= 1024)>;
+
+    // A valid tile size is a power of two within the respective range.
+    template <unsigned int Size>
+    using _is_valid_single_warp_tile =
+        _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_single_warp<Size>::value>;
+    template <unsigned int Size>
+    using _is_valid_multi_warp_tile =
+        _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_multi_warp<Size>::value>;
+#else
+    // Without C++11 features, no size is ever classified as multi-warp.
+    template <unsigned int Size>
+    struct _is_multi_warp {
+        static const bool value = false;
+    };
+#endif
+}
+
+template <unsigned int Size>
+class __static_size_tile_base
+{
+protected:
+    _CG_STATIC_CONST_DECL unsigned int numThreads = Size;
+
+public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
+
+    // Caller's position inside its tile: the block-level rank masked with (tile size - 1)
+    _CG_STATIC_QUALIFIER unsigned int thread_rank() {
+        return details::cta::thread_rank() & (numThreads - 1);
+    }
+
+    // Compile-time thread count of the tile
+    _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int num_threads() {
+        return Size;
+    }
+
+    _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int size() {  // same value as num_threads()
+        return Size;
+    }
+};
+
+template <unsigned int Size>
+class __static_size_thread_block_tile_base : public __static_size_tile_base<Size>
+{  // Collectives for a statically sized tile; Size is validated below as one of 1/2/4/8/16/32.
+    friend class details::_coalesced_group_data_access;
+    typedef details::tile::tile_helpers<Size> th;
+
+#ifdef _CG_CPP11_FEATURES
+    static_assert(details::_is_valid_single_warp_tile<Size>::value, "Size must be one of 1/2/4/8/16/32");
+#else
+    typedef typename details::verify_thread_block_tile_size<Size>::OK valid;  // ill-formed unless a specialization for Size defines OK
+#endif
+    using __static_size_tile_base<Size>::numThreads;
+    _CG_STATIC_CONST_DECL unsigned int fullMask = 0xFFFFFFFF;  // all 32 bits set; used unchanged when numThreads == 32
+
+ protected:
+    _CG_STATIC_QUALIFIER unsigned int build_mask() {  // lane mask handed to the *_sync intrinsics for the caller's tile
+        unsigned int mask = fullMask;
+        if (numThreads != 32) {
+            // [0,31] representing the current active thread in the warp
+            unsigned int laneId = details::laneid();
+            // shift mask according to the partition it belongs to (tileMask/laneMask come from tile_helpers, defined outside this excerpt)
+            mask = th::tileMask << (laneId & ~(th::laneMask));
+        }
+        return (mask);
+    }
+
+public:
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
+
+    _CG_STATIC_QUALIFIER void sync() {  // __syncwarp restricted to this tile's lane mask
+        __syncwarp(build_mask());
+    }
+
+#ifdef _CG_CPP11_FEATURES
+    // Shuffle collectives: each forwards to shuffle_dispatch with this tile's mask and numThreads as the width
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
+        return details::tile::shuffle_dispatch<TyElem>::shfl(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), srcRank, numThreads);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
+        return details::tile::shuffle_dispatch<TyElem>::shfl_down(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int delta) const {
+        return details::tile::shuffle_dispatch<TyElem>::shfl_up(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int laneMask) const {
+        return details::tile::shuffle_dispatch<TyElem>::shfl_xor(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), laneMask, numThreads);
+    }
+#else
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl(TyIntegral var, int srcRank) const {  // pre-C++11 variant: calls the intrinsic directly
+        details::assert_if_not_arithmetic<TyIntegral>();
+        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, unsigned int delta) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, unsigned int delta) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_xor(TyIntegral var, unsigned int laneMask) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
+    }
+#endif //_CG_CPP11_FEATURES
+
+    _CG_QUALIFIER int any(int predicate) const {  // non-zero if the tile's ballot has any bit set
+        unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
+        return (lane_ballot != 0);
+    }
+    _CG_QUALIFIER int all(int predicate) const {  // non-zero if the ballot equals the tile's full lane mask
+        unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
+        return (lane_ballot == build_mask());
+    }
+    _CG_QUALIFIER unsigned int ballot(int predicate) const {
+        unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
+        return (lane_ballot >> (details::laneid() & (~(th::laneMask))));  // same shift amount as build_mask(), applied in reverse
+    }
+
+#ifdef _CG_HAS_MATCH_COLLECTIVE
+    template <typename TyIntegral>
+    _CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        unsigned int lane_match = __match_any_sync(build_mask(), val);
+        return (lane_match >> (details::laneid() & (~(th::laneMask))));  // shifted the same way as ballot()
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {  // pred is written by __match_all_sync
+        details::assert_if_not_arithmetic<TyIntegral>();
+        unsigned int lane_match = __match_all_sync(build_mask(), val, &pred);
+        return (lane_match >> (details::laneid() & (~(th::laneMask))));
+    }
+#endif
+
+};
+
+template <unsigned int Size, typename ParentT>
+class __static_parent_thread_block_tile_base
+{  // Computes meta-group information from the statically known parent type; declares no data members.
+public:
+    // Index of this tile among the parent's tiles: the caller's parent rank divided by the tile size
+    _CG_STATIC_QUALIFIER unsigned int meta_group_rank() {
+        return ParentT::thread_rank() / Size;
+    }
+
+    // Number of tiles the parent is split into: parent size divided by Size, rounded up
+    _CG_STATIC_QUALIFIER unsigned int meta_group_size() {
+        return (ParentT::size() + Size - 1) / Size;
+    }
+};
+
+/**
+ * class thread_block_tile<unsigned int Size, ParentT = void>
+ *
+ * Statically-sized group type, representing one tile of a thread block.
+ * The only specializations currently supported are those with native
+ * hardware support (1/2/4/8/16/32)
+ *
+ * This group exposes warp-synchronous builtins.
+ * Can only be constructed via tiled_partition<Size>(ParentT&)
+ */
+
+template <unsigned int Size, typename ParentT = void>
+class __single_warp_thread_block_tile :
+    public __static_size_thread_block_tile_base<Size>,
+    public __static_parent_thread_block_tile_base<Size, ParentT>
+{  // Tile with a statically known parent type; this class itself declares no data members.
+    typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
+    friend class details::_coalesced_group_data_access;
+
+protected:
+    _CG_QUALIFIER __single_warp_thread_block_tile() { };
+    _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int, unsigned int) { };  // both arguments are ignored
+
+    _CG_STATIC_QUALIFIER unsigned int get_mask() {  // delegates to build_mask(), so the mask is recomputed on each call
+        return __static_size_thread_block_tile_base<Size>::build_mask();
+    }
+};
+
+template <unsigned int Size>
+class __single_warp_thread_block_tile<Size, void> :
+    public __static_size_thread_block_tile_base<Size>,
+    public thread_group_base<details::coalesced_group_id>
+{  // Parent-erased tile: mask, size and meta-group values are written to _data.coalesced at construction.
+    _CG_STATIC_CONST_DECL unsigned int numThreads = Size;
+
+    template <unsigned int, typename ParentT> friend class __single_warp_thread_block_tile;
+    friend class details::_coalesced_group_data_access;
+
+    typedef __static_size_thread_block_tile_base<numThreads> staticSizeBaseT;
+
+protected:
+    _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int meta_group_rank = 0, unsigned int meta_group_size = 1) {  // defaults describe a lone tile
+        _data.coalesced.mask = staticSizeBaseT::build_mask();
+        _data.coalesced.size = numThreads;
+        _data.coalesced.metaGroupRank = meta_group_rank;
+        _data.coalesced.metaGroupSize = meta_group_size;
+        _data.coalesced.is_tiled = true;
+    }
+
+    _CG_QUALIFIER unsigned int get_mask() const {  // returns the mask captured at construction
+        return (_data.coalesced.mask);
+    }
+
+public:
+    using staticSizeBaseT::sync;  // re-export the static-size implementations from staticSizeBaseT
+    using staticSizeBaseT::size;
+    using staticSizeBaseT::num_threads;
+    using staticSizeBaseT::thread_rank;
+
+    _CG_QUALIFIER unsigned int meta_group_rank() const {  // value passed to the constructor
+        return _data.coalesced.metaGroupRank;
+    }
+
+    _CG_QUALIFIER unsigned int meta_group_size() const {  // value passed to the constructor
+        return _data.coalesced.metaGroupSize;
+    }
+};
+
+/**
+ * Outer level API calls
+ * sync(GroupT) - returns void, see <group_type>.sync()
+ * thread_rank(GroupT) - returns the rank, see <group_type>.thread_rank()
+ * group_size(GroupT) - returns the thread count, see <group_type>.num_threads()
+ */
+template <class GroupT>
+_CG_QUALIFIER void sync(GroupT const &g)
+{
+    g.sync();  // forwards to the group's own sync()
+}
+
+// TODO: Use a static dispatch to determine appropriate return type
+// C++03 is stuck with unsigned long long for now
+#ifdef _CG_CPP11_FEATURES
+template <class GroupT>
+_CG_QUALIFIER auto thread_rank(GroupT const& g) -> decltype(g.thread_rank()) {  // return type follows the group's own thread_rank()
+    return g.thread_rank();
+}
+
+
+template <class GroupT>
+_CG_QUALIFIER auto group_size(GroupT const &g) -> decltype(g.num_threads()) {  // forwards to num_threads(), not size()
+    return g.num_threads();
+}
+#else
+template <class GroupT>
+_CG_QUALIFIER unsigned long long thread_rank(GroupT const& g) {  // fixed return type in the non-C++11 build
+    return static_cast<unsigned long long>(g.thread_rank());
+}
+
+
+template <class GroupT>
+_CG_QUALIFIER unsigned long long group_size(GroupT const &g) {  // fixed return type in the non-C++11 build
+    return static_cast<unsigned long long>(g.num_threads());
+}
+#endif
+
+
+/**
+ * tiled_partition
+ *
+ * tiled_partition(parent, tilesz) is a collective operation that splits the
+ * parent group into a one-dimensional, row-major tiling of subgroups.
+ *
+ * It creates ((size(parent)+tilesz-1)/tilesz) subgroups in total.
+ * Threads that share the same k = (thread_rank(parent)/tilesz)
+ * become members of the same subgroup.
+ *
+ * The implementation may make the calling thread wait until every member of
+ * the parent group has invoked the operation before it resumes execution.
+ *
+ * Functionality is limited to power-of-two sized subgroup instances of at most
+ * 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be
+ * tiled_partition() in _CG_VERSION 1000.
+ */
+_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz)
+{
+    // Dispatch on the group type recorded in the parent.
+    if (parent.get_type() != details::coalesced_group_id) {
+        // Every non-coalesced parent is handled as a thread_block.
+        const thread_block *block_parent = static_cast<const thread_block*>(&parent);
+        return block_parent->_get_tiled_threads(tilesz);
+    }
+    const coalesced_group *coalesced_parent = static_cast<const coalesced_group*>(&parent);
+    return coalesced_parent->_get_tiled_threads(tilesz);
+}
+
+// Overload for a thread_block parent: the result is a plain thread_group for now (may be specialized later)
+_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz)
+{
+    return parent._get_tiled_threads(tilesz);
+}
+
+// Overload for a coalesced_group parent: the result keeps the coalesced_group type
+_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz)
+{
+    return parent._get_tiled_threads(tilesz);
+}
+
+namespace details {
+    template <unsigned int Size, typename ParentT>
+    class internal_thread_block_tile : public __single_warp_thread_block_tile<Size, ParentT> {};  // tile type used internally by the multi-warp implementation
+
+    template <unsigned int Size, typename ParentT>
+    _CG_QUALIFIER internal_thread_block_tile<Size, ParentT> tiled_partition_internal() {  // takes no parent object; default-constructs the tile
+        return internal_thread_block_tile<Size, ParentT>();
+    }
+
+    template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
+    _CG_QUALIFIER TyVal multi_warp_collectives_helper(
+            const GroupT& group,
+            WarpLambda warp_lambda,
+            InterWarpLambda inter_warp_lambda) {
+                return group.template collectives_scheme<TyVal>(warp_lambda, inter_warp_lambda);  // forwards to the group's collectives_scheme
+            }
+
+    template <typename T, typename GroupT>
+    _CG_QUALIFIER T* multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id) {  // forwards to group.get_scratch_location<T>(warp_id)
+        return group.template get_scratch_location<T>(warp_id);
+    }
+
+    template <typename GroupT>
+    _CG_QUALIFIER details::barrier_t* multi_warp_sync_location_getter(const GroupT& group) {  // forwards to group.get_sync_location()
+        return group.get_sync_location();
+    }
+
+}
+/**
+ * tiled_partition<tilesz>
+ *
+ * The tiled_partition<tilesz>(parent) method is a collective operation that
+ * partitions the parent group into a one-dimensional, row-major tiling of subgroups.
+ *
+ * A total of (size(parent)/tilesz) subgroups will be created,
+ * therefore the parent group size must be evenly divisible by tilesz.
+ * The allowed parent groups are thread_block or thread_block_tile<size>.
+ *
+ * The implementation may cause the calling thread to wait until all the members
+ * of the parent group have invoked the operation before resuming execution.
+ *
+ * Functionality is limited to native hardware sizes, 1/2/4/8/16/32.
+ * The size(parent) must be greater than the template Size parameter,
+ * otherwise the results are undefined.
+ */
+
+#if defined(_CG_CPP11_FEATURES)
+template <unsigned int Size>
+class __static_size_multi_warp_tile_base : public __static_size_tile_base<Size>
+{  // Tile spanning several warps; collectives go through the scratch slots and barriers in tile_memory.
+    static_assert(details::_is_valid_multi_warp_tile<Size>::value, "Size must be one of 64/128/256/512");  // NOTE(review): the trait's range also admits 1024 - confirm the message is intended
+
+    template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
+    friend __device__ TyVal details::multi_warp_collectives_helper(
+            const GroupT& group,
+            WarpLambda warp_lambda,
+            InterWarpLambda inter_warp_lambda);
+    template <typename T, typename GroupT>
+    friend __device__ T* details::multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id);
+    template <typename GroupT>
+    friend __device__ details::barrier_t* details::multi_warp_sync_location_getter(const GroupT& group);
+    template <unsigned int OtherSize>
+    friend class __static_size_multi_warp_tile_base;
+    using WarpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;  // one warp of this tile
+    using ThisType = __static_size_multi_warp_tile_base<Size>;
+    _CG_STATIC_CONST_DECL int numWarps = Size / 32;
+
+protected:
+    details::multi_warp_scratch* const tile_memory;  // copied from the group this tile is constructed from
+
+    template <typename GroupT>
+    _CG_QUALIFIER __static_size_multi_warp_tile_base(const GroupT& g) : tile_memory(g.tile_memory) {
+#if defined(_CG_HAS_RESERVED_SHARED)
+        details::sync_warps_reset(get_sync_location(), details::cta::thread_rank());  // reset this size's barrier, then sync the source group
+        g.sync();
+#endif
+    }
+
+
+private:
+    _CG_QUALIFIER details::barrier_t* get_sync_location() const {
+        // Different group sizes use different barriers, all groups of a given size share one barrier.
+        unsigned int sync_id = details::log2(Size / 64);  // barrier index derived from Size (details::log2 is defined outside this excerpt)
+        return &tile_memory->barriers[sync_id];
+    }
+
+    template <typename T>
+    _CG_QUALIFIER T* get_scratch_location(unsigned int warp_id) const {
+        unsigned int scratch_id = (details::cta::thread_rank() - thread_rank()) / 32 + warp_id;  // block-level warp index of the tile's first thread, plus warp_id
+        return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
+    }
+
+    template <typename T>
+    _CG_QUALIFIER T* get_scratch_location() const {
+        unsigned int scratch_id = details::cta::thread_rank() / 32;  // slot of the calling thread's own warp
+        return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
+    }
+
+    template <typename TyVal>
+    _CG_QUALIFIER TyVal shfl_impl(TyVal val, unsigned int src) const {  // every thread returns the value supplied by tile rank src
+        unsigned int src_warp = src / 32;
+        auto warp = details::tiled_partition_internal<32, ThisType>();
+        details::barrier_t* sync_location = get_sync_location();
+
+        // Get warp slot of the source threads warp.
+        TyVal* warp_scratch_location = get_scratch_location<TyVal>(src_warp);
+
+        if (warp.meta_group_rank() == src_warp) {
+            warp.sync();
+            // Put shuffled value into my warp slot and let my warp arrive at the barrier.
+            if (thread_rank() == src) {
+                *warp_scratch_location = val;
+            }
+            details::sync_warps_arrive(sync_location, details::cta::thread_rank(), numWarps);
+            TyVal result = *warp_scratch_location;
+            details::sync_warps_wait(sync_location, details::cta::thread_rank());
+            return result;
+        }
+        else {
+            // Wait for the source warp to arrive on the barrier.
+            details::sync_warps_wait_for_specific_warp(sync_location,
+                    (details::cta::thread_rank() / 32 - warp.meta_group_rank() + src_warp));
+            TyVal result = *warp_scratch_location;
+            details::sync_warps(sync_location, details::cta::thread_rank(), numWarps);
+            return result;
+        }
+    }
+
+    template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
+    _CG_QUALIFIER TyVal collectives_scheme(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const {
+        static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
+                      "Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
+        auto warp = details::tiled_partition_internal<32, ThisType>();
+        details::barrier_t* sync_location = get_sync_location();
+        TyVal* warp_scratch_location = get_scratch_location<TyVal>();
+
+        warp_lambda(warp, warp_scratch_location);  // per-warp step; any()/all() store the warp's partial result in its slot
+
+        if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), numWarps)) {  // presumably true only for the last warp to arrive - confirm
+            auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();
+            if (subwarp.meta_group_rank() == 0) {
+                TyVal* thread_scratch_location = get_scratch_location<TyVal>(subwarp.thread_rank());  // thread i handles the slot of warp i
+                inter_warp_lambda(subwarp, thread_scratch_location);
+            }
+            warp.sync();
+            details::sync_warps_release(sync_location, warp.thread_rank() == 0, details::cta::thread_rank(), numWarps);
+        }
+        TyVal result = *warp_scratch_location;  // read the value left in the caller's own warp slot
+        return result;
+    }
+
+public:
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::multi_tile_group_id;
+
+    using __static_size_tile_base<Size>::thread_rank;
+
+    template <typename TyVal>
+    _CG_QUALIFIER TyVal shfl(TyVal val, unsigned int src) const {  // size-checked entry point for shfl_impl
+        static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
+                      "Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
+        return shfl_impl(val, src);
+    }
+
+    _CG_QUALIFIER void sync() const {  // barrier across the numWarps warps of this tile
+        details::sync_warps(get_sync_location(), details::cta::thread_rank(), numWarps);
+    }
+
+    _CG_QUALIFIER int any(int predicate) const {
+        auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
+                *warp_scratch_location = __any_sync(0xFFFFFFFF, predicate);  // full-warp vote stored in the warp's slot
+        };
+        auto inter_warp_lambda =
+            [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
+                *thread_scratch_location = __any_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);  // vote over the low numWarps lanes
+        };
+        return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
+    }
+
+    _CG_QUALIFIER int all(int predicate) const {
+        auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
+                *warp_scratch_location = __all_sync(0xFFFFFFFF, predicate);  // full-warp vote stored in the warp's slot
+        };
+        auto inter_warp_lambda =
+            [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
+                *thread_scratch_location = __all_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);  // vote over the low numWarps lanes
+        };
+        return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
+    }
+};
+
+
+template <unsigned int Size, typename ParentT = void>
+class __multi_warp_thread_block_tile :
+    public __static_size_multi_warp_tile_base<Size>,
+    public __static_parent_thread_block_tile_base<Size, ParentT>
+{  // Multi-warp tile with a statically known parent; meta-group values come from the static parent base.
+    typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
+    typedef __static_size_multi_warp_tile_base<Size> staticTileBaseT;
+protected:
+    _CG_QUALIFIER __multi_warp_thread_block_tile(const ParentT& g) :
+        __static_size_multi_warp_tile_base<Size>(g) {}  // the size base copies g.tile_memory
+};
+
+template <unsigned int Size>
+class __multi_warp_thread_block_tile<Size, void> : public __static_size_multi_warp_tile_base<Size>
+{  // Parent-erased multi-warp tile: meta-group values are captured from the source tile at construction.
+    const unsigned int metaGroupRank;
+    const unsigned int metaGroupSize;
+
+protected:
+    template <unsigned int OtherSize, typename ParentT>
+    _CG_QUALIFIER __multi_warp_thread_block_tile(const __multi_warp_thread_block_tile<OtherSize, ParentT>& g) :
+        __static_size_multi_warp_tile_base<Size>(g), metaGroupRank(g.meta_group_rank()), metaGroupSize(g.meta_group_size()) {}
+
+public:
+    _CG_QUALIFIER unsigned int meta_group_rank() const {  // value captured from the source tile
+        return metaGroupRank;
+    }
+
+    _CG_QUALIFIER unsigned int meta_group_size() const {  // value captured from the source tile
+        return metaGroupSize;
+    }
+};
+#endif
+
+template <unsigned int Size, typename ParentT = void>  // forward declaration; ParentT defaults to void
+class thread_block_tile;
+
+namespace details {
+    template <unsigned int Size, typename ParentT, bool IsMultiWarp>  // IsMultiWarp selects the single-warp or multi-warp base
+    class thread_block_tile_impl;
+
+    template <unsigned int Size, typename ParentT>
+    class thread_block_tile_impl<Size, ParentT, false>: public __single_warp_thread_block_tile<Size, ParentT>
+    {
+    protected:
+        template <unsigned int OtherSize, typename OtherParentT, bool OtherIsMultiWarp>
+        _CG_QUALIFIER thread_block_tile_impl(const thread_block_tile_impl<OtherSize, OtherParentT, OtherIsMultiWarp>& g) :
+            __single_warp_thread_block_tile<Size, ParentT>(g.meta_group_rank(), g.meta_group_size()) {}  // carries the source tile's meta-group values over
+
+        _CG_QUALIFIER thread_block_tile_impl(const thread_block& g) :
+            __single_warp_thread_block_tile<Size, ParentT>() {}  // g itself is not read here
+    };
+
+#if defined(_CG_CPP11_FEATURES)
+    template <unsigned int Size, typename ParentT>
+    class thread_block_tile_impl<Size, ParentT, true> : public __multi_warp_thread_block_tile<Size, ParentT>
+    {
+        protected:
+        template <typename GroupT>
+        _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) :
+            __multi_warp_thread_block_tile<Size, ParentT>(g) {}
+    };
+#else
+    template <unsigned int Size, typename ParentT>
+    class thread_block_tile_impl<Size, ParentT, true>  // empty placeholder for builds without C++11 features
+    {
+        protected:
+        template <typename GroupT>
+        _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) {}
+    };
+#endif
+}
+
+template <unsigned int Size, typename ParentT>
+class thread_block_tile : public details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>
+{  // Tile with a statically known parent; the base is chosen by whether Size counts as multi-warp.
+    friend _CG_QUALIFIER thread_block_tile<1, void> this_thread();
+
+protected:
+    _CG_QUALIFIER thread_block_tile(const ParentT& g) :
+        details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>(g) {}
+
+public:
+    _CG_QUALIFIER operator thread_block_tile<Size, void>() const {  // conversion to the parent-erased tile of the same size
+        return thread_block_tile<Size, void>(*this);
+    }
+};
+
+template <unsigned int Size>
+class thread_block_tile<Size, void> : public details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>
+{  // Parent-erased tile type.
+    template <unsigned int, typename ParentT>
+    friend class thread_block_tile;
+
+protected:
+    template <unsigned int OtherSize, typename OtherParentT>
+    _CG_QUALIFIER thread_block_tile(const thread_block_tile<OtherSize, OtherParentT>& g) :
+        details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}  // protected: source tile may have any size
+
+public:
+    template <typename ParentT>
+    _CG_QUALIFIER thread_block_tile(const thread_block_tile<Size, ParentT>& g) :
+        details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}  // public: source tile must have the same Size
+};
+
+namespace details {
+    template <unsigned int Size, typename ParentT>  // only the two parent kinds specialized below are defined
+    struct tiled_partition_impl;
+
+    template <unsigned int Size>
+    struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size, thread_block> {
+        _CG_QUALIFIER tiled_partition_impl(const thread_block& g) :
+            thread_block_tile<Size, thread_block>(g) {}  // public constructor that forwards to the tile's protected one
+    };
+
+    // ParentT = static thread_block_tile<ParentSize, GrandParent> specialization
+    template <unsigned int Size, unsigned int ParentSize, typename GrandParent>
+    struct tiled_partition_impl<Size, thread_block_tile<ParentSize, GrandParent> > :
+        public thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> > {
+#ifdef _CG_CPP11_FEATURES
+        static_assert(Size < ParentSize, "Tile size bigger or equal to the parent group size");  // the tile must be strictly smaller than its parent
+#endif
+        _CG_QUALIFIER tiled_partition_impl(const thread_block_tile<ParentSize, GrandParent>& g) :
+            thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> >(g) {}
+    };
+
+}
+
+template <unsigned int Size, typename ParentT>
+_CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
+{
+    return details::tiled_partition_impl<Size, ParentT>(g);  // the impl derives from thread_block_tile<Size, ParentT>, the declared return type
+}
+
+/**
+ * thread_block_tile<1, void> this_thread()
+ *
+ * Constructs a single-thread tile (Size 1, parent-erased) for the calling thread
+ */
+_CG_QUALIFIER thread_block_tile<1, void> this_thread()
+{
+    // Make thread_block_tile<1, thread_block> parent of the returned group, so it will have its
+    // meta group rank and size set to 0 and 1 respectively.
+    return thread_block_tile<1, thread_block_tile<1, thread_block> >(this_thread_block());
+}
+
+/**
+ * <group_type>.sync()
+ *
+ * Runs a barrier across the group, selected by the stored group type.
+ *
+ * The barrier acts as both a compiler fence and an architectural fence, so
+ * memory accesses are not reordered around it.
+ */
+_CG_QUALIFIER void thread_group::sync() const
+{
+    switch (_data.group.type) {
+    case details::coalesced_group_id:
+        cooperative_groups::sync(*static_cast<const coalesced_group*>(this));
+        return;
+    case details::thread_block_id:
+        cooperative_groups::sync(*static_cast<const thread_block*>(this));
+        return;
+    case details::grid_group_id:
+        cooperative_groups::sync(*static_cast<const grid_group*>(this));
+        return;
+#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    case details::multi_grid_group_id:
+        cooperative_groups::sync(*static_cast<const multi_grid_group*>(this));
+        return;
+#endif
+#if defined(_CG_HAS_CLUSTER_GROUP)
+    case details::cluster_group_id:
+        cooperative_groups::sync(*static_cast<const cluster_group*>(this));
+        return;
+#endif
+    default:
+        return;
+    }
+}
+
+/**
+ * <group_type>.size()
+ *
+ * Reports the total thread count of the group, selected by the stored group type.
+ */
+_CG_QUALIFIER unsigned long long thread_group::size() const
+{
+    // Each case downcasts to the concrete group type and returns its group_size() directly.
+    switch (_data.group.type) {
+    case details::coalesced_group_id:
+        // stored type: coalesced_group
+        return cooperative_groups::group_size(*static_cast<const coalesced_group*>(this));
+    case details::thread_block_id:
+        // stored type: thread_block
+        return cooperative_groups::group_size(*static_cast<const thread_block*>(this));
+    case details::grid_group_id:
+        // stored type: grid_group
+        return cooperative_groups::group_size(*static_cast<const grid_group*>(this));
+#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    case details::multi_grid_group_id:
+        // stored type: multi_grid_group
+        return cooperative_groups::group_size(*static_cast<const multi_grid_group*>(this));
+#endif
+#if defined(_CG_HAS_CLUSTER_GROUP)
+    case details::cluster_group_id:
+        // stored type: cluster_group
+        return cooperative_groups::group_size(*static_cast<const cluster_group*>(this));
+#endif
+    default:
+        break;
+    }
+    return 0;
+}
+
+/**
+ * <group_type>.thread_rank()
+ *
+ * Reports the caller's linearized rank in the interval [0, size()), selected by the stored group type.
+ */
+_CG_QUALIFIER unsigned long long thread_group::thread_rank() const
+{
+    // Each case downcasts to the concrete group type and returns its thread_rank() directly.
+    switch (_data.group.type) {
+    case details::coalesced_group_id:
+        // stored type: coalesced_group
+        return cooperative_groups::thread_rank(*static_cast<const coalesced_group*>(this));
+    case details::thread_block_id:
+        // stored type: thread_block
+        return cooperative_groups::thread_rank(*static_cast<const thread_block*>(this));
+    case details::grid_group_id:
+        // stored type: grid_group
+        return cooperative_groups::thread_rank(*static_cast<const grid_group*>(this));
+#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    case details::multi_grid_group_id:
+        // stored type: multi_grid_group
+        return cooperative_groups::thread_rank(*static_cast<const multi_grid_group*>(this));
+#endif
+#if defined(_CG_HAS_CLUSTER_GROUP)
+    case details::cluster_group_id:
+        // stored type: cluster_group
+        return cooperative_groups::thread_rank(*static_cast<const cluster_group*>(this));
+#endif
+    default:
+        break;
+    }
+    return 0;
+}
+
+_CG_END_NAMESPACE
+
+#include <cooperative_groups/details/partitioning.h>
+#if (!defined(_MSC_VER) || defined(_WIN64))
+# include <cooperative_groups/details/invoke.h>
+#endif
+
+# endif /* ! (__cplusplus, __CUDACC__) */
+
+#endif /* !_COOPERATIVE_GROUPS_H_ */
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/async.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/async.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b7dcb2433f2cb7d1ef61290995ac871a901b1e8
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/async.h
@@ -0,0 +1,452 @@
+/* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * The source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * The Licensed Deliverables contained herein are PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CG_ASYNC_H
+#define _CG_ASYNC_H
+
+#include "helpers.h"
+#include "info.h"
+
+#include <cuda_pipeline.h>
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+// Groups supported by memcpy_async
+template <class TyGroup>
+struct _async_copy_group_supported : public _CG_STL_NAMESPACE::false_type {};
+
+template <unsigned int Sz, typename TyPar>
+struct _async_copy_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>>
+    : public _CG_STL_NAMESPACE::true_type {};
+template <>
+struct _async_copy_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
+template <>
+struct _async_copy_group_supported<cooperative_groups::thread_block> : public _CG_STL_NAMESPACE::true_type {};
+
+template <class TyGroup>
+using async_copy_group_supported = _async_copy_group_supported<details::remove_qual<TyGroup>>;
+
+// Groups that take the tile-optimized copy path: any thread_block_tile except one of size 1
+template <class TyGroup>
+struct _async_copy_optimize_tile : public _CG_STL_NAMESPACE::false_type {};
+
+template <typename TyPar>
+struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<1, TyPar>>
+    : public _CG_STL_NAMESPACE::false_type {};
+
+template <unsigned int Sz, typename TyPar>
+struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<Sz, TyPar>>
+    : public _CG_STL_NAMESPACE::true_type {};
+
+template <class TyGroup>
+using async_copy_optimize_tile = _async_copy_optimize_tile<details::remove_qual<TyGroup>>;
+
+// SFINAE helpers for tile optimizations
+template <class TyGroup>
+using enable_tile_optimization =
+    typename _CG_STL_NAMESPACE::enable_if<async_copy_optimize_tile<TyGroup>::value, void *>::type;
+
+template <class TyGroup>
+using disable_tile_optimization =
+    typename _CG_STL_NAMESPACE::enable_if<!async_copy_optimize_tile<TyGroup>::value, void *>::type;
+
+// Segment for punning to aligned types
+template <unsigned int N>
+struct _Segment {
+    int _seg[N];
+};
+
+// Trivial layout guaranteed-aligned copy-async compatible segments
+template <unsigned int N>
+struct Segment;
+template <>
+struct __align__(4) Segment<1> : public _Segment<1>{};
+template <>
+struct __align__(8) Segment<2> : public _Segment<2>{};
+template <>
+struct __align__(16) Segment<4> : public _Segment<4>{};
+
+// Plain element-by-element copy: each thread handles elements rank, rank + size(), ... below count
+template <typename TyGroup, typename TyElem>
+_CG_STATIC_QUALIFIER void inline_copy(TyGroup &group, TyElem *__restrict__ dst, const TyElem *__restrict__ src,
+                                      size_t count) {
+    const unsigned int rank = group.thread_rank();
+    const unsigned int stride = group.size();
+
+    for (size_t idx = rank; idx < count; idx += stride) {
+        dst[idx] = src[idx];
+    }
+}
+
+template <typename TyGroup, typename TyElem, enable_tile_optimization<TyGroup> = nullptr>
+_CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
+                                                 const TyElem *__restrict__ src, size_t count) {
+    static_assert(async_copy_group_supported<TyGroup>::value,
+                  "Async copy is only supported for groups that represent private shared memory");
+
+    if (count == 0) {
+        return;
+    }
+
+    const bool dstIsNotShared = !__isShared(dst);
+    const bool srcIsNotGlobal = !__isGlobal(src);
+
+    if (dstIsNotShared || srcIsNotGlobal) {  // not a global -> shared copy: use the plain copy loop instead
+        inline_copy(group, dst, src, count);
+        return;
+    }
+
+    const unsigned int stride = group.size();
+    const unsigned int rank = group.thread_rank();
+    // Efficient copies require warps to operate on the same amount of work at each step.
+    // Remainders are handled in a separate stage to prevent branching.
+    const unsigned int subWarpMask = (stride - 1);  // assumes the tile size is a power of two
+    const unsigned int subwarpCopies = (subWarpMask & (unsigned int)count);  // leftover elements past the full steps
+    const unsigned int maxSubwarpRank = min(rank, subwarpCopies - 1);  // clamp: surplus threads repeat the last leftover
+
+    const size_t warpCopies = (count & (~subWarpMask));  // elements covered by full-stride steps
+
+    for (size_t idx = 0; idx < warpCopies; idx += stride) {
+        size_t _srcIdx = rank + idx;
+        size_t _dstIdx = rank + idx;
+        __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
+    }
+
+    if (subwarpCopies) {  // maxSubwarpRank is only meaningful (no unsigned wrap) inside this branch
+        size_t _srcIdx = warpCopies + maxSubwarpRank;
+        size_t _dstIdx = warpCopies + maxSubwarpRank;
+        __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
+    }
+}
+
+template <typename TyGroup, typename TyElem, disable_tile_optimization<TyGroup> = nullptr>
+_CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
+                                                 const TyElem *__restrict__ src, size_t count) {
+    static_assert(async_copy_group_supported<TyGroup>::value,
+                  "Async copy is only supported for groups that represent private shared memory");
+
+    const bool dstIsNotShared = !__isShared(dst);
+    const bool srcIsNotGlobal = !__isGlobal(src);
+
+    if (dstIsNotShared || srcIsNotGlobal) {  // not a global -> shared copy: use the plain copy loop instead
+        inline_copy(group, dst, src, count);
+        return;
+    }
+
+    unsigned int stride = group.size();
+    unsigned int rank = group.thread_rank();
+
+    for (size_t idx = rank; idx < count; idx += stride) {  // one async copy per element owned by this thread
+        size_t _srcIdx = idx;
+        size_t _dstIdx = idx;
+        __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
+    }
+}
+
+// Determine the best alignment src and dst can share: the lowest address bit in [MinAlignment, MaxAlignment)
+// where they differ, else MaxAlignment. Kept small on purpose; most likely only for 1 and 2 byte alignments
+template <unsigned int MinAlignment, unsigned int MaxAlignment>
+_CG_STATIC_QUALIFIER uint32_t find_best_alignment(void *__restrict__ dst, const void *__restrict__ src) {
+    // Narrowing conversion intentional
+    uint32_t base1 = (uint32_t) reinterpret_cast<uintptr_t>(src);
+    uint32_t base2 = (uint32_t) reinterpret_cast<uintptr_t>(dst);
+
+    uint32_t diff = ((base1) ^ (base2)) & (MaxAlignment - 1);
+
+    // Walk candidate alignments from MaxAlignment / 2 down to MinAlignment (step: x >> 1).
+    // Each differing bit overwrites `out`, so the smallest differing alignment is the one returned.
+    uint32_t out = MaxAlignment;
+#pragma unroll
+    for (uint32_t alignment = (MaxAlignment >> 1); alignment >= MinAlignment; alignment >>= 1) {
+        if (alignment & diff)
+            out = alignment;
+    }
+
+    return out;
+}
+
+// Copy `count` bytes in three stages: a plain byte copy until src reaches TyType alignment, an
+// accelerated_async_copy of the whole TyType elements that follow, and a plain byte copy of the remaining tail
+template <typename TyType, typename TyGroup>
+_CG_STATIC_QUALIFIER void copy_like(const TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
+                                    size_t count) {
+    const char *src = reinterpret_cast<const char *>(_src);
+    char *dst = reinterpret_cast<char *>(_dst);
+
+    constexpr uint32_t targetAlignment = (uint32_t)alignof(TyType);
+
+    uint32_t base = (uint32_t) reinterpret_cast<uintptr_t>(src);
+    uint32_t alignOffset = ((~base) + 1) & (targetAlignment - 1);  // (-base) mod targetAlignment: bytes up to aligned src
+
+    inline_copy(group, dst, src, alignOffset);
+    count -= alignOffset;
+    src += alignOffset;
+    dst += alignOffset;
+
+    // Copy using the best available alignment, async_copy expects n-datums, not bytes
+    size_t asyncCount = count / sizeof(TyType);
+    accelerated_async_copy(group, reinterpret_cast<TyType *>(dst), reinterpret_cast<const TyType *>(src), asyncCount);
+    asyncCount *= sizeof(TyType);
+
+    count -= asyncCount;
+    src += asyncCount;
+    dst += asyncCount;
+    inline_copy(group, dst, src, count);
+}
+
+// We must determine alignment and manually align src/dst ourselves
+template <size_t AlignHint>
+struct _memcpy_async_align_dispatch {
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ dst, const void *__restrict__ src, size_t count) {
+        uint32_t alignment = find_best_alignment<AlignHint, 16>(dst, src);
+
+        // Fall back to AlignHint for short copies, and when 2 is only the mutual alignment (hint < 2): see case 2
+        alignment = (count < alignment || (alignment == 2 && AlignHint < 2)) ? AlignHint : alignment;
+
+        switch (alignment) {
+        default:
+        case 1:
+            inline_copy(group, reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src), count);
+            break;
+        case 2:  // needs AlignHint >= 2: the short copy neither aligns the pointers nor copies an odd trailing byte
+            inline_copy(group, reinterpret_cast<short *>(dst), reinterpret_cast<const short *>(src), count >> 1);
+            break;
+        case 4:
+            copy_like<Segment<1>>(group, dst, src, count);
+            break;
+        case 8:
+            copy_like<Segment<2>>(group, dst, src, count);
+            break;
+        case 16:
+            copy_like<Segment<4>>(group, dst, src, count);
+            break;
+        }
+    }
+};
+
+// Specialization for 4 byte alignments: src/dst are used as-is, with no run-time alignment probing
+template <>
+struct _memcpy_async_align_dispatch<4> {
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
+                                   size_t count) {
+        const Segment<1> *src = reinterpret_cast<const Segment<1> *>(_src);
+        Segment<1> *dst = reinterpret_cast<Segment<1> *>(_dst);
+
+        // Dispatch straight to aligned LDGSTS calls; the byte count is truncated to whole segments
+        accelerated_async_copy(group, dst, src, count / sizeof(*dst));
+    }
+};
+
+// Specialization for 8 byte alignments: src/dst are used as-is, with no run-time alignment probing
+template <>
+struct _memcpy_async_align_dispatch<8> {
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
+                                   size_t count) {
+        const Segment<2> *src = reinterpret_cast<const Segment<2> *>(_src);
+        Segment<2> *dst = reinterpret_cast<Segment<2> *>(_dst);
+
+        // Dispatch straight to aligned LDGSTS calls; the byte count is truncated to whole segments
+        accelerated_async_copy(group, dst, src, count / sizeof(*dst));
+    }
+};
+
+// Alignments over 16 are truncated to 16 and bypass alignment
+// This is the highest performing memcpy available
+template <>
+struct _memcpy_async_align_dispatch<16> {
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
+                                   size_t count) {
+        const Segment<4> *src = reinterpret_cast<const Segment<4> *>(_src);
+        Segment<4> *dst = reinterpret_cast<Segment<4> *>(_dst);
+
+        // Dispatch straight to aligned LDGSTS calls; the byte count is truncated to whole segments
+        accelerated_async_copy(group, dst, src, count / sizeof(*dst));
+    }
+};
+
+// byte-wide API
+template <size_t Alignment, class TyGroup>
+_CG_STATIC_QUALIFIER void _memcpy_async_dispatch_to_aligned_copy(const TyGroup &group, void *__restrict__ _dst,
+                                                                 const void *__restrict__ _src, size_t count) {
+    static_assert(!(Alignment & (Alignment - 1)), "Known static alignment dispatch must be a power of 2");
+    details::_memcpy_async_align_dispatch<Alignment>::copy(group, _dst, _src, count);
+}
+
+// Internal dispatch APIs
+// These deduce the alignments and sizes necessary to invoke the underlying copy engine
+template <typename Ty>
+using is_void = _CG_STL_NAMESPACE::is_same<Ty, void>;
+
+template <typename Ty>
+using enable_if_not_void = typename _CG_STL_NAMESPACE::enable_if<!is_void<Ty>::value, void *>::type;
+
+template <typename Ty>
+using enable_if_void = typename _CG_STL_NAMESPACE::enable_if<is_void<Ty>::value, void *>::type;
+
+template <typename Ty>
+using enable_if_integral =
+    typename _CG_STL_NAMESPACE::enable_if<_CG_STL_NAMESPACE::is_integral<Ty>::value, void *>::type;
+
+// byte-wide API using aligned_size_t
+template <class TyGroup, template <size_t> typename Alignment, size_t Hint>
+_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, void *__restrict__ _dst,
+                                              const void *__restrict__ _src, const Alignment<Hint> &count) {
+    constexpr size_t _align = (Hint > 16) ? 16 : Hint;
+
+    details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, (size_t)count);
+}
+
+// byte-wide API using type for alignment
+template <class TyGroup, typename TyElem, typename TySize, size_t Hint = alignof(TyElem),
+          enable_if_not_void<TyElem> = nullptr, enable_if_integral<TySize> = nullptr>
+_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
+                                              const TyElem *__restrict__ _src, const TySize& count) {
+    constexpr size_t _align = (Hint > 16) ? 16 : Hint;
+
+    details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, count);
+}
+
+// byte-wide API with full alignment deduction required
+template <class TyGroup, typename TyElem, typename TySize, enable_if_void<TyElem> = nullptr,
+          enable_if_integral<TySize> = nullptr>
+_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
+                                              const TyElem *__restrict__ _src, const TySize& count) {
+    details::_memcpy_async_dispatch_to_aligned_copy<1>(group, _dst, _src, count);
+}
+
+// 1d-datum API
+template <class TyGroup, typename TyElem, size_t Hint = alignof(TyElem)>
+_CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const size_t dstCount,
+                                              const TyElem *__restrict__ src, const size_t srcCount) {
+    constexpr unsigned int _align = Hint;
+    const size_t totalCount = min(dstCount, srcCount) * sizeof(TyElem);
+
+    details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
+}
+
+// 1d-datum API using aligned_size_t
+template <class TyGroup, typename TyElem, template <size_t> typename Alignment, size_t Hint>
+_CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const Alignment<Hint> &dstCount,
+                                              const TyElem *__restrict__ src, const Alignment<Hint> &srcCount) {
+    constexpr unsigned int _align = Hint;
+    const size_t totalCount = min((size_t)dstCount, (size_t)srcCount) * sizeof(TyElem);
+
+    details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
+}
+
+} // namespace details
+
+/*
+ * Group submit batch of async-copy to cover contiguous 1D array
+ * and commit that batch to eventually wait for completion.
+ */
+template <class TyGroup, typename TyElem, typename TySizeT>
+_CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ _dst, const TyElem *__restrict__ _src,
+                                       const TySizeT &count) {
+    details::_memcpy_async_bytes(group, _dst, _src, count);
+    __pipeline_commit();
+}
+
+/*
+ * Group submit batch of async-copy to cover contiguous 1D array
+ * and commit that batch to eventually wait for completion.
+ * Object counts are in datum sized chunks, not bytes.
+ */
+template <class TyGroup, class TyElem, typename DstLayout, typename SrcLayout>
+_CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ dst, const DstLayout &dstLayout,
+                                       const TyElem *__restrict__ src, const SrcLayout &srcLayout) {
+    details::_memcpy_async_datum(group, dst, dstLayout, src, srcLayout);
+    __pipeline_commit();
+}
+
+/* Group wait for prior Nth stage of memcpy_async to complete. */
+template <unsigned int Stage, class TyGroup>
+_CG_STATIC_QUALIFIER void wait_prior(const TyGroup &group) {
+    __pipeline_wait_prior(Stage);
+    group.sync();
+}
+
+/* Group wait all previously submitted memcpy_async to complete. */
+template <class TyGroup>
+_CG_STATIC_QUALIFIER void wait(const TyGroup &group) {
+    __pipeline_wait_prior(0);
+    group.sync();
+}
+
+/***************** CG APIs including pipeline are deprecated *****************/
+
+/* Group submit batch of async-copy to cover a contiguous 1D array
+   to a pipeline and commit the batch */
+template <class TyGroup, class TyElem>
+_CG_DEPRECATED _CG_STATIC_QUALIFIER void memcpy_async(TyGroup &group, TyElem *dst, size_t dstCount, const TyElem *src, size_t srcCount,
+                                       nvcuda::experimental::pipeline &pipe) {
+    details::_memcpy_async_datum(group, dst, dstCount, src, srcCount);
+    pipe.commit();
+}
+
+/* Group wait for prior Nth stage of memcpy_async to complete. */
+template <unsigned int Stage, class TyGroup>
+_CG_DEPRECATED _CG_STATIC_QUALIFIER void wait_prior(TyGroup &group, nvcuda::experimental::pipeline &pipe) {
+    pipe.wait_prior<Stage>();
+    group.sync();
+}
+
+/* Group wait for stage-S of memcpy_async to complete. */
+template <class TyGroup>
+_CG_DEPRECATED _CG_STATIC_QUALIFIER void wait(TyGroup &group, nvcuda::experimental::pipeline &pipe, size_t stage) {
+    pipe.wait(stage);
+    group.sync();
+}
+_CG_END_NAMESPACE
+
+#endif // _CG_ASYNC_H
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/coalesced_reduce.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/coalesced_reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ba03fc9e4d0c78f07e3e5e1f97aff03e7a3d6f8
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/coalesced_reduce.h
@@ -0,0 +1,95 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_COALESCED_REDUCE_H_
+#define _CG_COALESCED_REDUCE_H_
+
+#include "info.h"
+#include "helpers.h"
+#include "cooperative_groups.h"
+#include "partitioning.h"
+#include "coalesced_scan.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+template <typename TyVal, typename TyOp, unsigned int TySize, typename ParentT>
+_CG_QUALIFIER auto coalesced_reduce(const __single_warp_thread_block_tile<TySize, ParentT>& group, 
+                                    TyVal&& val,
+                                    TyOp&& op) -> decltype(op(val, val)) {
+    auto out = val;
+    for (int mask = TySize >> 1; mask > 0; mask >>= 1) {  // xor-shuffle distances TySize/2, ..., 2, 1
+        out = op(out, group.shfl_xor(out, mask));
+    }
+
+    return out;
+}
+
+template <typename TyVal, typename TyOp>
+_CG_QUALIFIER auto coalesced_reduce(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
+    if (group.size() == 32) {
+        // Full coalesced group can go through faster path by being treated as a tile of size 32
+        auto tile = details::tiled_partition_internal<32, void>();
+        return coalesced_reduce(tile, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+    }
+    else {
+        auto scan_result =
+            inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+        unsigned int group_mask = _coalesced_group_data_access::get_mask(group);
+        unsigned int last_thread_id = 31 - __clz(group_mask);  // position of the highest set bit in the mask
+        return details::tile::shuffle_dispatch<TyVal>::shfl(
+            _CG_STL_NAMESPACE::forward<TyVal>(scan_result), group_mask, last_thread_id, 32);
+    }
+}
+
+} // details
+
+_CG_END_NAMESPACE
+
+#endif // _CG_COALESCED_REDUCE_H_
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/functional.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/functional.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f151fe2c270421ba56e22935e84c4bf93790eff
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/functional.h
@@ -0,0 +1,212 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_FUNCTIONAL_H
+#define _CG_FUNCTIONAL_H
+
+#include "info.h"
+#include "helpers.h"
+
+#ifdef _CG_CPP11_FEATURES
+#ifdef _CG_USE_CUDA_STL
+# include <cuda/std/functional>
+#endif
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+#ifdef _CG_USE_CUDA_STL
+    using cuda::std::plus;
+    using cuda::std::bit_and;
+    using cuda::std::bit_xor;
+    using cuda::std::bit_or;
+#else
+    template <typename Ty> struct plus {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 + arg2;}};
+    template <typename Ty> struct bit_and {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 & arg2;}};
+    template <typename Ty> struct bit_xor {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 ^ arg2;}};
+    template <typename Ty> struct bit_or {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 | arg2;}};
+#endif // _CG_USE_CUDA_STL
+} // details
+
+template <typename Ty>
+struct plus : public details::plus<Ty> {};
+
+template <typename Ty>
+struct less {  // reduction functor, not a predicate: yields the smaller of its two arguments
+    __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
+        return (arg2 < arg1) ? arg2 : arg1;  // arg1 is returned unless arg2 compares strictly less
+    }
+};
+
+template <typename Ty>
+struct greater {  // reduction functor, not a predicate: yields the larger of its two arguments
+    __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
+        return (arg1 < arg2) ? arg2 : arg1;  // arg1 is returned unless it compares strictly less than arg2
+    }
+};
+
+template <typename Ty>
+struct bit_and : public details::bit_and<Ty> {};
+
+template <typename Ty>
+struct bit_xor : public details::bit_xor<Ty> {};
+
+template <typename Ty>
+struct bit_or : public details::bit_or<Ty> {};
+
+#if defined(_CG_HAS_STL_ATOMICS)
+namespace details {
+    template <class Ty>
+    using _atomic_is_type_supported = _CG_STL_NAMESPACE::integral_constant<bool,
+            _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) == 4 || sizeof(Ty) == 8)>;
+
+    template <typename TyOp> struct _atomic_op_supported                                : public _CG_STL_NAMESPACE::false_type {};
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::plus<Ty>>    : public _atomic_is_type_supported<Ty> {};
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::less<Ty>>    : public _atomic_is_type_supported<Ty> {};
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::greater<Ty>> : public _atomic_is_type_supported<Ty> {};
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_and<Ty>> : public _atomic_is_type_supported<Ty> {};
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_or<Ty>>  : public _atomic_is_type_supported<Ty> {};
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_xor<Ty>> : public _atomic_is_type_supported<Ty> {};
+
+    template<typename TyAtomic, typename TyVal, typename TyOp>
+    _CG_QUALIFIER remove_qual<TyVal> atomic_cas_fallback(TyAtomic&& atomic, TyVal&& val, TyOp&& op) {
+        auto old = atomic.load(cuda::std::memory_order_relaxed);
+        while(!atomic.compare_exchange_weak(old, op(old, val), cuda::std::memory_order_relaxed));  // retry until op(old, val) is stored
+        return old;  // the value that was replaced by the successful exchange
+    }
+
+    template<typename TyOp>
+    struct op_picker;
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::plus<TyVal>> {
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_add(val, cuda::std::memory_order_relaxed);
+        }
+    };
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::less<TyVal>> {
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_min(val, cuda::std::memory_order_relaxed);
+        }
+    };
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::greater<TyVal>> {
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_max(val, cuda::std::memory_order_relaxed);
+        }
+    };
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::bit_and<TyVal>> {
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_and(val, cuda::std::memory_order_relaxed);
+        }
+    };
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::bit_xor<TyVal>> {
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_xor(val, cuda::std::memory_order_relaxed);
+        }
+    };
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::bit_or<TyVal>> {
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_or(val, cuda::std::memory_order_relaxed);
+        }
+    };
+
+    template<bool atomic_supported>
+    struct atomic_update_dispatch {};
+
+    template<>
+    struct atomic_update_dispatch<false> {
+        template<typename TyAtomic, typename TyVal, typename TyOp>
+        _CG_STATIC_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
+            return atomic_cas_fallback(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+        }
+    };
+
+    template<>
+    struct atomic_update_dispatch<true> {
+        template<typename TyAtomic, typename TyVal, typename TyOp>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val, TyOp&& op) {
+            using dispatch = op_picker<details::remove_qual<TyOp>>;
+
+            return dispatch::atomic_update(atomic, val);
+        }
+    };
+
+    template<typename TyAtomic, typename TyVal, typename TyOp>
+    _CG_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
+        using dispatch = atomic_update_dispatch<_atomic_op_supported<details::remove_qual<TyOp>>::value>;
+
+        return dispatch::atomic_update(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+    }
+
+    template<typename TyAtomic, typename TyVal>
+    _CG_QUALIFIER void atomic_store(TyAtomic& atomic, TyVal&& val) {
+        atomic.store(val, cuda::std::memory_order_relaxed);
+    }
+}
+#endif
+
+_CG_END_NAMESPACE
+
+#endif
+#endif //_CG_FUNCTIONAL_H
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/helpers.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..1485d9f503daa8d518af75775f7a7a415cb031d4
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/helpers.h
@@ -0,0 +1,693 @@
+ /* Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _COOPERATIVE_GROUPS_HELPERS_H_
+# define _COOPERATIVE_GROUPS_HELPERS_H_
+
+#include "info.h"
+#include "sync.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+#ifdef _CG_CPP11_FEATURES
+    template <typename Ty> struct _is_float_or_half          : public _CG_STL_NAMESPACE::is_floating_point<Ty> {}; // default: same answer as is_floating_point
+# ifdef _CG_HAS_FP16_COLLECTIVE
+    template <>            struct _is_float_or_half<__half>  : public _CG_STL_NAMESPACE::true_type {}; // __half and __half2 also qualify when _CG_HAS_FP16_COLLECTIVE is defined
+    template <>            struct _is_float_or_half<__half2> : public _CG_STL_NAMESPACE::true_type {};
+# endif
+    template <typename Ty>
+    using  is_float_or_half = _is_float_or_half<typename _CG_STL_NAMESPACE::remove_cv<Ty>::type>; // const/volatile are stripped before the trait lookup
+
+    // Non-STL utility templates: remove_qual strips the reference and then the cv-qualifiers from Ty
+    template <typename Ty>
+    using remove_qual = typename _CG_STL_NAMESPACE::remove_cv<typename _CG_STL_NAMESPACE::remove_reference<Ty>::type>::type;
+
+    template <typename TyLhs, typename TyRhs>
+    using is_op_type_same = _CG_STL_NAMESPACE::is_same<remove_qual<TyLhs>, remove_qual<TyRhs>
+    >; // true when both types are identical after remove_qual
+#endif
+
+    template <typename TyTrunc>
+    _CG_STATIC_QUALIFIER TyTrunc vec3_to_linear(dim3 index, dim3 nIndex) {
+        // Flatten a 3D index with x varying fastest; each coordinate is converted to TyTrunc before it is multiplied.
+        return (static_cast<TyTrunc>(index.z) * nIndex.y * nIndex.x) +
+               (static_cast<TyTrunc>(index.y) * nIndex.x) + static_cast<TyTrunc>(index.x);
+    }
+
+    namespace cta {
+
+        // Synchronize through __barrier_sync with argument 0.
+        _CG_STATIC_QUALIFIER void sync() {
+            __barrier_sync(0);
+        }
+
+        // Threads per block: product of the three blockDim components.
+        _CG_STATIC_QUALIFIER unsigned int num_threads() {
+            return static_cast<unsigned int>(blockDim.x * blockDim.y * blockDim.z);
+        }
+
+        // Linear rank of the calling thread inside its block (x varies fastest).
+        _CG_STATIC_QUALIFIER unsigned int thread_rank() {
+            return vec3_to_linear<unsigned int>(threadIdx, blockDim);
+        }
+
+        // blockIdx repackaged as a dim3.
+        _CG_STATIC_QUALIFIER dim3 group_index() {
+            return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
+        }
+
+        // threadIdx repackaged as a dim3.
+        _CG_STATIC_QUALIFIER dim3 thread_index() {
+            return dim3(threadIdx.x, threadIdx.y, threadIdx.z);
+        }
+
+        // blockDim repackaged as a dim3.
+        _CG_STATIC_QUALIFIER dim3 dim_threads() {
+            return dim3(blockDim.x, blockDim.y, blockDim.z);
+        }
+
+        // Legacy aliases
+        // size() is the older spelling of num_threads().
+        _CG_STATIC_QUALIFIER unsigned int size() {
+            return num_threads();
+        }
+
+        // block_dim() is the older spelling of dim_threads().
+        _CG_STATIC_QUALIFIER dim3 block_dim() {
+            return dim_threads();
+        }
+
+    };
+
+    class _coalesced_group_data_access { // Static helpers that reach into a group's mask and coalesced meta-group fields
+    public:
+        // Retrieve mask of coalesced groups and tiles
+        template <typename TyGroup>
+        _CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup &group) {
+            return group.get_mask();
+        }
+
+        template <typename TyGroup>
+        _CG_STATIC_QUALIFIER TyGroup construct_from_mask(unsigned int mask) { // builds a TyGroup through its single-argument mask constructor
+            return TyGroup(mask);
+        }
+
+        template <typename TyGroup>
+        _CG_STATIC_QUALIFIER void modify_meta_group(TyGroup &group, unsigned int mgRank, unsigned int mgSize) { // overwrites the group's meta-group rank and size in place
+            group._data.coalesced.metaGroupRank = mgRank;
+            group._data.coalesced.metaGroupSize = mgSize;
+        }
+    };
+
+    namespace tile {
+        template <unsigned int TileCount, unsigned int TileMask, unsigned int LaneMask, unsigned int ShiftCount>
+        struct _tile_helpers{ // Re-exports the four template arguments as named static constants
+            _CG_STATIC_CONST_DECL unsigned int tileCount = TileCount;
+            _CG_STATIC_CONST_DECL unsigned int tileMask = TileMask;
+            _CG_STATIC_CONST_DECL unsigned int laneMask = LaneMask;
+            _CG_STATIC_CONST_DECL unsigned int shiftCount = ShiftCount;
+        };
+
+        template <unsigned int> struct tile_helpers; // Only the sizes specialized below are defined. For size N: tileCount = 32/N, tileMask = low N bits, laneMask = N-1, shiftCount = log2(N)
+        template <> struct tile_helpers<32> : public _tile_helpers<1,  0xFFFFFFFF, 0x1F, 5> {};
+        template <> struct tile_helpers<16> : public _tile_helpers<2,  0x0000FFFF, 0x0F, 4> {};
+        template <> struct tile_helpers<8>  : public _tile_helpers<4,  0x000000FF, 0x07, 3> {};
+        template <> struct tile_helpers<4>  : public _tile_helpers<8,  0x0000000F, 0x03, 2> {};
+        template <> struct tile_helpers<2>  : public _tile_helpers<16, 0x00000003, 0x01, 1> {};
+        template <> struct tile_helpers<1>  : public _tile_helpers<32, 0x00000001, 0x00, 0> {};
+
+#ifdef _CG_CPP11_FEATURES
+        namespace shfl {
+            /***********************************************************************************
+             * Recursively Sliced Shuffle
+             *  Purpose:
+             *      Slices an input type a number of times into integral types so that shuffles
+             *      are well defined
+             *  Expectations:
+             *      This object *should not* be used from a reinterpret_cast pointer unless
+             *      some alignment guarantees can be met. Use a memcpy to guarantee that loads
+             *      from the integral types stored within are aligned and correct.
+             **********************************************************************************/
+            template <unsigned int count, bool intSized = (count <= sizeof(int))> // count is a size in bytes; intSized selects one of the two specializations
+            struct recursive_sliced_shuffle_helper;
+
+            template <unsigned int count>
+            struct recursive_sliced_shuffle_helper<count, true> { // Terminal case: the remaining bytes fit in a single int
+                int val;
+
+                template <typename TyFn>
+                _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
+                    val = shfl(val);
+                }
+            };
+
+            template <unsigned int count>
+            struct recursive_sliced_shuffle_helper<count, false> { // Recursive case: one int here, everything after it in next
+                int val;
+                recursive_sliced_shuffle_helper<count - sizeof(int)> next; // helper covering the bytes beyond this int
+
+                template <typename TyFn>
+                _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) { // applies shfl to this slice, then to each following slice
+                    val = shfl(val);
+                    next.invoke_shuffle(shfl);
+                }
+            };
+        }
+
+        struct _memory_shuffle { // Chosen by shuffle_dispatch for non-native types larger than _MemoryShuffleCutoff; moves no data between threads
+            template <typename TyElem, typename TyShflFn>
+            _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
+                static_assert(sizeof(TyElem) <= 32, "Cooperative groups collectives are limited to types smaller than 32B"); // fails for the types shuffle_dispatch routes here, since those exceed the 32-byte cutoff
+                return TyElem{}; // value-initialized result; elem and fn are not used
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
+                auto shfl = [=](int val) -> int { // stub lambda: ignores val and returns 0
+                    return 0;
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                auto shfl = [=](int val) -> int { // stub lambda: ignores val and returns 0
+                    return 0;
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                auto shfl = [=](int val) -> int { // stub lambda: ignores val and returns 0
+                    return 0;
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
+                auto shfl = [=](int val) -> int { // stub lambda: ignores val and returns 0
+                    return 0;
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+        };
+
+        /***********************************************************************************
+         * Intrinsic Device Function Shuffle
+         *  Purpose:
+         *      Uses a shuffle helper that has characteristics best suited for moving
+         *      elements between threads
+         *  Expectations:
+         *      Object given will be forced into an l-value type so that it can be used
+         *      with a helper structure that reinterprets the data into intrinsic compatible
+         *      types
+         *  Notes:
+         *      !! TyRet is required so that objects are returned by value and not as
+         *      dangling references depending on the value category of the passed object
+         **********************************************************************************/
+        struct _intrinsic_compat_shuffle { // Shuffles trivially copyable values one int-sized slice at a time
+            template <unsigned int count>
+            using shfl_helper = shfl::recursive_sliced_shuffle_helper<count>;
+
+            template <typename TyElem, typename TyShflFn>
+            _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
+                static_assert(__is_trivially_copyable(TyElem), "Type is not compatible with device shuffle");
+                shfl_helper<sizeof(TyElem)> helper;
+                memcpy(&helper, &elem, sizeof(TyElem)); // copy the bytes in instead of aliasing elem through a cast
+                helper.invoke_shuffle(fn); // run fn on every int slice of the helper
+                memcpy(&elem, &helper, sizeof(TyElem)); // copy the shuffled bytes back into the by-value elem
+                return elem;
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
+                auto shfl = [=](int val) -> int { // per-slice __shfl_sync; gMask, srcRank and threads are captured by value
+                    return __shfl_sync(gMask, val, srcRank, threads);
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                auto shfl = [=](int val) -> int { // per-slice __shfl_down_sync with the captured delta
+                    return __shfl_down_sync(gMask, val, delta, threads);
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                auto shfl = [=](int val) -> int { // per-slice __shfl_up_sync with the captured delta
+                    return __shfl_up_sync(gMask, val, delta, threads);
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
+                auto shfl = [=](int val) -> int { // per-slice __shfl_xor_sync with the captured lane mask
+                    return __shfl_xor_sync(gMask, val, lMask, threads);
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+        };
+
+        struct _native_shuffle { // Direct path: the value goes to the __shfl_*_sync intrinsic as-is and the result is cast back to TyElem
+            template <typename TyElem>
+            _CG_STATIC_QUALIFIER TyElem shfl(
+                    TyElem elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
+                return static_cast<TyElem>(__shfl_sync(gMask, elem, srcRank, threads));
+            }
+
+            template <typename TyElem>
+            _CG_STATIC_QUALIFIER TyElem shfl_down(
+                    TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                return static_cast<TyElem>(__shfl_down_sync(gMask, elem, delta, threads));
+            }
+
+            template <typename TyElem>
+            _CG_STATIC_QUALIFIER TyElem shfl_up(
+                    TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                return static_cast<TyElem>(__shfl_up_sync(gMask, elem, delta, threads));
+            }
+
+            template <typename TyElem>
+            _CG_STATIC_QUALIFIER TyElem shfl_xor(
+                    TyElem elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
+                return static_cast<TyElem>(__shfl_xor_sync(gMask, elem, lMask, threads));
+            }
+        };
+
+        // Almost all arithmetic types are supported by native shuffle
+        // Vector types are the exception. The test is made on the type with references and cv-qualifiers removed.
+        template <typename TyElem>
+        using use_native_shuffle = _CG_STL_NAMESPACE::integral_constant<
+            bool,
+            _CG_STL_NAMESPACE::is_integral<
+                remove_qual<TyElem>>::value ||
+            details::is_float_or_half<
+                remove_qual<TyElem>>::value
+        >;
+
+        constexpr unsigned long long _MemoryShuffleCutoff = 32; // size in bytes above which InMem becomes true
+
+        template <typename TyElem,
+                  bool IsNative = use_native_shuffle<TyElem>::value,
+                  bool InMem = (sizeof(TyElem) > _MemoryShuffleCutoff)>
+        struct shuffle_dispatch; // only the three combinations below are defined; <TyElem, true, true> has no specialization
+
+        template <typename TyElem>
+        struct shuffle_dispatch<TyElem, true, false> :  public _native_shuffle {}; // native type within the cutoff
+
+        template <typename TyElem>
+        struct shuffle_dispatch<TyElem, false, false> : public _intrinsic_compat_shuffle {}; // non-native type within the cutoff
+
+        template <typename TyElem>
+        struct shuffle_dispatch<TyElem, false, true> :  public _memory_shuffle {}; // non-native type above the cutoff
+
+#endif //_CG_CPP11_FEATURES
+    };
+
+    namespace multi_grid {
+        struct multi_grid_functions; // forward declaration; the definition further down is guarded by _CG_HAS_MULTI_GRID_GROUP and _CG_CPP11_FEATURES
+    };
+
+    namespace grid {
+        _CG_STATIC_QUALIFIER unsigned int barrier_arrive(unsigned int *bar) { // forwards bar to sync_grids_arrive; the result is the token barrier_wait expects
+            return details::sync_grids_arrive(bar);
+        }
+
+        _CG_STATIC_QUALIFIER void barrier_wait(unsigned int token, unsigned int *bar) { // forwards a token from barrier_arrive to sync_grids_wait
+            details::sync_grids_wait(token, bar);
+        }
+
+        _CG_STATIC_QUALIFIER void sync(unsigned int *bar) { // arrive, then wait with the token that arrive returned
+            unsigned int token = details::sync_grids_arrive(bar);
+            details::sync_grids_wait(token, bar);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long num_blocks()
+        {
+            // grid.y * grid.z -> [max(65535) * max(65535)] fits within 4 bytes, promote after multiplication
+            // grid.x * (grid.y * grid.z) -> [max(2^31-1) * max(65535 * 65535)]  exceeds 4 bytes, promote before multiplication
+            return (unsigned long long)gridDim.x * (gridDim.y * gridDim.z);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long num_threads() // blocks in the grid times threads per block, as a 64-bit value
+        {
+            return num_blocks() * cta::num_threads();
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long block_rank() // linear position of blockIdx within gridDim (x varies fastest)
+        {
+            return vec3_to_linear<unsigned long long>(blockIdx, gridDim);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long thread_rank() // threads of all lower-ranked blocks plus the rank inside this block
+        {
+            return block_rank() * cta::num_threads() + cta::thread_rank();
+        }
+
+        _CG_STATIC_QUALIFIER dim3 dim_blocks() // gridDim repackaged as a dim3
+        {
+            return dim3(gridDim.x, gridDim.y, gridDim.z);
+        }
+
+        _CG_STATIC_QUALIFIER dim3 block_index() // blockIdx repackaged as a dim3
+        {
+            return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
+        }
+
+        _CG_STATIC_QUALIFIER dim3 dim_threads() // per-axis grid extent in threads; each product is a 32-bit dim3 component
+        {
+            return dim3(gridDim.x * blockDim.x, gridDim.y * blockDim.y, gridDim.z * blockDim.z);
+        }
+
+        _CG_STATIC_QUALIFIER dim3 thread_index() // per-axis thread coordinate within the grid
+        {
+            return dim3(blockIdx.x * blockDim.x + threadIdx.x,
+                        blockIdx.y * blockDim.y + threadIdx.y,
+                        blockIdx.z * blockDim.z + threadIdx.z);
+        }
+
+#if defined(_CG_HAS_CLUSTER_GROUP)
+        _CG_STATIC_QUALIFIER dim3 dim_clusters() { // forwards the result of __clusterGridDimInClusters()
+            return __clusterGridDimInClusters();
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long num_clusters() { // product of the three dim_clusters() components
+            const dim3 dimClusters = dim_clusters();
+            return (unsigned long long)dimClusters.x * dimClusters.y * dimClusters.z; // widen before multiplying so the product cannot wrap at 32 bits
+        }
+
+        _CG_STATIC_QUALIFIER dim3 cluster_index() { // forwards the result of __clusterIdx()
+            return __clusterIdx();
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long cluster_rank() { // linear position of cluster_index() within dim_clusters()
+            return vec3_to_linear<unsigned long long>(cluster_index(), dim_clusters());
+        }
+#endif
+
+        // Legacy aliases
+        _CG_STATIC_QUALIFIER unsigned long long size() // same value as num_threads()
+        {
+            return num_threads();
+        }
+
+        _CG_STATIC_QUALIFIER dim3 grid_dim() // same value as dim_blocks()
+        {
+            return dim_blocks();
+        }
+    };
+
+
+#if defined(_CG_HAS_MULTI_GRID_GROUP)
+
+    namespace multi_grid {
+        _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle() // multi-grid handle from the device runtime; 0 when neither RDC nor EWP mode is active
+        {
+#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
+            //this function is defined in device runtime library
+            //which requires separate compilation mode (__CUDACC_RDC__)
+            //or extended whole program mode (__CUDACC_EWP__)
+            return (cudaCGGetIntrinsicHandle(cudaCGScopeMultiGrid));
+#else   /* !(__CUDACC_RDC__ || __CUDACC_EWP__) */
+            return 0;
+#endif  /* __CUDACC_RDC__ || __CUDACC_EWP__ */
+        }
+
+        _CG_STATIC_QUALIFIER void sync(const unsigned long long handle) // does nothing when neither RDC nor EWP mode is active
+        {
+#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
+            //this function is defined in device runtime library
+            //which requires separate compilation mode (__CUDACC_RDC__)
+            //or extended whole program mode (__CUDACC_EWP__)
+            (void)cudaCGSynchronize(handle, 0); // result discarded: sync() returns void and has no way to report it
+#endif  /* __CUDACC_RDC__ || __CUDACC_EWP__ */
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle) // thread count from cudaCGGetSize; stays 0 without RDC/EWP
+        {
+            unsigned int numThreads = 0;
+#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
+            //this function is defined in device runtime library
+            //which requires separate compilation mode (__CUDACC_RDC__)
+            //or extended whole program mode (__CUDACC_EWP__)
+            cudaCGGetSize(&numThreads, NULL, handle);
+#endif  /* __CUDACC_RDC__ || __CUDACC_EWP__ */
+            return numThreads;
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle) // thread rank from cudaCGGetRank; stays 0 without RDC/EWP
+        {
+            unsigned int threadRank = 0;
+#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
+            //this function is defined in device runtime library
+            //which requires separate compilation mode (__CUDACC_RDC__)
+            //or extended whole program mode (__CUDACC_EWP__)
+            cudaCGGetRank(&threadRank, NULL, handle);
+#endif  /* __CUDACC_RDC__ || __CUDACC_EWP__ */
+            return threadRank;
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int grid_rank(const unsigned long long handle) // grid rank from cudaCGGetRank; stays 0 without RDC/EWP
+        {
+            unsigned int gridRank = 0;
+#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
+            //this function is defined in device runtime library
+            //which requires separate compilation mode (__CUDACC_RDC__)
+            //or extended whole program mode (__CUDACC_EWP__)
+            cudaCGGetRank(NULL, &gridRank, handle);
+#endif  /* __CUDACC_RDC__ || __CUDACC_EWP__ */
+            return gridRank;
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int num_grids(const unsigned long long handle) // grid count from cudaCGGetSize; stays 0 without RDC/EWP
+        {
+            unsigned int numGrids = 0;
+#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
+            //this function is defined in device runtime library
+            //which requires separate compilation mode (__CUDACC_RDC__)
+            //or extended whole program mode (__CUDACC_EWP__)
+            cudaCGGetSize(NULL, &numGrids, handle);
+#endif  /* __CUDACC_RDC__ || __CUDACC_EWP__ */
+            return numGrids;
+        }
+
+# ifdef _CG_CPP11_FEATURES
+        struct multi_grid_functions { // one function pointer per free function above, typed through decltype
+            decltype(multi_grid::get_intrinsic_handle) *get_intrinsic_handle;
+            decltype(multi_grid::sync) *sync;
+            decltype(multi_grid::size) *size;
+            decltype(multi_grid::thread_rank) *thread_rank;
+            decltype(multi_grid::grid_rank) *grid_rank;
+            decltype(multi_grid::num_grids) *num_grids;
+        };
+
+        template <typename = void>
+        _CG_STATIC_QUALIFIER const multi_grid_functions* load_grid_intrinsics() { // returns the address of a function-local __constant__ table of the six functions
+            __constant__ static const multi_grid_functions mgf {
+                &multi_grid::get_intrinsic_handle,
+                &multi_grid::sync,
+                &multi_grid::size,
+                &multi_grid::thread_rank,
+                &multi_grid::grid_rank,
+                &multi_grid::num_grids
+            };
+
+            return &mgf;
+        }
+# endif
+    };
+#endif
+
+#if defined(_CG_HAS_CLUSTER_GROUP)
+    namespace cluster {
+
+        // Forwards the result of __clusterDimIsSpecified().
+        _CG_STATIC_QUALIFIER bool isReal() {
+            return __clusterDimIsSpecified();
+        }
+
+        // Arrival half of the cluster barrier.
+        _CG_STATIC_QUALIFIER void barrier_arrive() {
+            __cluster_barrier_arrive();
+        }
+
+        // Waiting half of the cluster barrier.
+        _CG_STATIC_QUALIFIER void barrier_wait() {
+            __cluster_barrier_wait();
+        }
+
+        // Full barrier: arrive immediately followed by wait.
+        _CG_STATIC_QUALIFIER void sync() {
+            barrier_arrive();
+            barrier_wait();
+        }
+
+        // Forwards addr to __cluster_query_shared_rank and returns its result.
+        _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr) {
+            return __cluster_query_shared_rank(addr);
+        }
+
+        // Passes addr and rank to __cluster_map_shared_rank and casts the result back to T*.
+        template <typename T>
+        _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank) {
+            return static_cast<T*>(__cluster_map_shared_rank(addr, rank));
+        }
+
+        // Block index relative to the cluster, as reported by __clusterRelativeBlockIdx().
+        _CG_STATIC_QUALIFIER dim3 block_index() {
+            return __clusterRelativeBlockIdx();
+        }
+
+        // Block rank relative to the cluster, as reported by __clusterRelativeBlockRank().
+        _CG_STATIC_QUALIFIER unsigned int block_rank() {
+            return __clusterRelativeBlockRank();
+        }
+
+        // Per-axis thread coordinate within the cluster: block index scaled by blockDim, plus threadIdx.
+        _CG_STATIC_QUALIFIER dim3 thread_index() {
+            const dim3 blk = block_index();
+            return dim3(blk.x * blockDim.x + threadIdx.x,
+                        blk.y * blockDim.y + threadIdx.y,
+                        blk.z * blockDim.z + threadIdx.z);
+        }
+
+        // Linear thread rank within the cluster: threads of lower-ranked blocks plus the rank inside the block.
+        _CG_STATIC_QUALIFIER unsigned int thread_rank() {
+            return block_rank() * cta::num_threads() + cta::thread_rank();
+        }
+
+        // Forwards the result of __clusterDim().
+        _CG_STATIC_QUALIFIER dim3 dim_blocks() {
+            return __clusterDim();
+        }
+
+        // Forwards the result of __clusterSizeInBlocks().
+        _CG_STATIC_QUALIFIER unsigned int num_blocks() {
+            return __clusterSizeInBlocks();
+        }
+
+        // Cluster extent in threads along each axis:
+        // blocks per axis multiplied by blockDim on that axis.
+        _CG_STATIC_QUALIFIER dim3 dim_threads() {
+            const dim3 blocks = dim_blocks();
+            return dim3(blocks.x * blockDim.x,
+                        blocks.y * blockDim.y,
+                        blocks.z * blockDim.z);
+        }
+
+        // Total threads in the cluster: block count times threads per block.
+        _CG_STATIC_QUALIFIER unsigned int num_threads() {
+            return num_blocks() * cta::num_threads();
+        }
+
+    };
+#endif
+
+    _CG_STATIC_QUALIFIER unsigned int laneid()
+    {
+        unsigned int lane; // receives the value of the %laneid special register
+        asm ("mov.u32 %0, %%laneid;" : "=r"(lane));
+        return lane;
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int lanemask32_eq()
+    {
+        unsigned int mask; // receives the value of the %lanemask_eq special register
+        asm ("mov.u32 %0, %%lanemask_eq;" : "=r"(mask));
+        return mask;
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int lanemask32_lt()
+    {
+        unsigned int mask; // receives the value of the %lanemask_lt special register
+        asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
+        return mask;
+    }
+
+    // Function-style entry point that invokes _CG_ABORT().
+    _CG_STATIC_QUALIFIER void abort() {
+        _CG_ABORT();
+    }
+
+    template <typename Ty>
+    _CG_QUALIFIER void assert_if_not_arithmetic() { // compile-time check only; the body is empty without _CG_CPP11_FEATURES
+#ifdef _CG_CPP11_FEATURES
+        static_assert(
+            _CG_STL_NAMESPACE::is_integral<Ty>::value ||
+            details::is_float_or_half<Ty>::value, // accepts integral types and whatever is_float_or_half accepts
+            "Error: Ty is neither integer or float"
+        );
+#endif //_CG_CPP11_FEATURES
+    }
+
+#ifdef _CG_CPP11_FEATURES
+    _CG_STATIC_QUALIFIER constexpr unsigned int log2(unsigned int x) { // floor of the base-2 logarithm for x >= 1, computed by recursive halving
+        return x <= 1 ? 0 : 1 + log2(x / 2); // x <= 1 is the base case so that x == 0 also terminates (result 0)
+    }
+#endif //_CG_CPP11_FEATURES
+
+}; // !Namespace details
+
+_CG_END_NAMESPACE
+
+#endif /* !_COOPERATIVE_GROUPS_HELPERS_H_ */
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/memory.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/memory.h
new file mode 100644
index 0000000000000000000000000000000000000000..47cf260f3b4e0b29bf08c948697102bf027616db
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/memory.h
@@ -0,0 +1,135 @@
+/* Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _COOPERATIVE_GROUPS_MEMORY_H_
+# define _COOPERATIVE_GROUPS_MEMORY_H_
+
+#include "info.h"
+
+_CG_BEGIN_NAMESPACE
+
+#if defined(_CG_CPP11_FEATURES)
+namespace details {
+    _CG_STATIC_CONST_DECL int scratch_num_reserved_bytes = 12; // size of multi_warp_scratch::reserved; also counted by scratch_size_needed()
+
+#if defined(_CG_HAS_RESERVED_SHARED)
+    _CG_STATIC_QUALIFIER void* reserved_shared_ptr() // pointer derived from the %reserved_smem_offset_1 special register
+    {
+        void *ptr;
+        asm ("{\n\t"
+             " .reg .u32 start;\n\t"
+             " .reg .u64 extended;\n\t"
+             " mov.u32 start, %%reserved_smem_offset_1;\n\t" // read the 32-bit offset
+             " cvt.u64.u32 extended, start;\n\t" // widen it to 64 bits
+             " cvta.shared.u64 %0, extended;\n\t" // convert the shared-space address to a generic address, written to ptr
+             "}"
+             : "=" _CG_ASM_PTR_CONSTRAINT(ptr));
+        return ptr;
+    }
+#endif
+
+    struct multi_warp_scratch { // scratch layout: barriers, reserved padding, then the communication slots
+        // One barrier per possible size of the group.
+        _CG_STATIC_CONST_DECL unsigned int memory_barriers_count = 5;
+        _CG_STATIC_CONST_DECL size_t sync_memory_size = memory_barriers_count * sizeof(barrier_t);
+
+        using communication_type = unsigned long long;
+        _CG_STATIC_CONST_DECL size_t communication_size = sizeof(communication_type);
+
+        // Layout of the scratch space:
+        barrier_t barriers[memory_barriers_count];
+        char reserved[scratch_num_reserved_bytes]; // Reserve 12 bytes for future use
+        communication_type communication_memory[default_max_block_size / 32]; // one slot per 32 threads of default_max_block_size
+
+        _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int scratch_size_needed(unsigned int max_block_size) { // bytes: reserved + barriers + one slot per 32 threads of max_block_size
+            // One slot of collectives memory per warp.
+            return scratch_num_reserved_bytes + sync_memory_size + max_block_size / 32 * communication_size;
+        }
+
+        _CG_QUALIFIER void init_barriers(unsigned int thread_rank) { // each thread with rank below memory_barriers_count zeroes one barrier
+            if (thread_rank < memory_barriers_count) {
+                barriers[thread_rank] = 0;
+            }
+        }
+    };
+
+#if defined(_CG_HAS_RESERVED_SHARED)
+    // CG can expect at least 288 bytes available in reserved shared memory
+    static_assert(sizeof(multi_warp_scratch) <= 288, "multi-warp scratch size is too large");
+#endif
+
+    // Make sure the structure fits into user-provided memory sized by scratch_size_needed(default_max_block_size)
+    static_assert(sizeof(multi_warp_scratch) <= multi_warp_scratch::scratch_size_needed(default_max_block_size),
+                  "multi-warp scratch size is too large");
+
+
+    _CG_QUALIFIER multi_warp_scratch* get_scratch_ptr(void* user_scratch) {
+        // With _CG_HAS_RESERVED_SHARED the reserved shared pointer is used and user_scratch is ignored.
+#if defined(_CG_HAS_RESERVED_SHARED)
+        void* const scratch_base = reserved_shared_ptr();
+#else
+        void* const scratch_base = user_scratch;
+#endif
+        return static_cast<multi_warp_scratch*>(scratch_base);
+
+    }
+
+}
+
+template <unsigned int MaxBlockSize = details::default_max_block_size>
+struct __align__(details::multi_warp_scratch::communication_size) block_tile_memory { // aligned to sizeof(multi_warp_scratch::communication_type)
+private:
+#if !defined(_CG_HAS_RESERVED_SHARED)
+    char scratch[details::multi_warp_scratch::scratch_size_needed(MaxBlockSize)]; // backing bytes; the member is omitted when _CG_HAS_RESERVED_SHARED is defined
+#endif
+};
+#endif
+
+_CG_END_NAMESPACE
+
+#endif /* !_COOPERATIVE_GROUPS_MEMORY_H_ */
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/reduce.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..0313b52a23f440e283509993d6f7997ba5df2365
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/reduce.h
@@ -0,0 +1,419 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_REDUCE_H_
+#define _CG_REDUCE_H_
+
+#include "info.h"
+#include "helpers.h"
+#include "coalesced_reduce.h"
+#include "functional.h"
+#include "cooperative_groups.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+    template <class Ty>  // true iff Ty is an integral type of at most 4 bytes
+    using _redux_is_add_supported = _CG_STL_NAMESPACE::integral_constant<
+            bool,
+            _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) <= 4)>;
+
+    template <class Ty>  // alias of _redux_is_add_supported without the leading underscore
+    using redux_is_add_supported = _redux_is_add_supported<Ty>;
+
+    // A specialization for 64-bit logical operations is possible, but for now bitwise ops
+    // reuse the add constraint (integral, at most 4 bytes), i.e. only 32-bit ops are accelerated.
+    template <class Ty>
+    using redux_is_logical_supported = redux_is_add_supported<Ty>;
+
+    // Base case: an operator/type pair is not redux-accelerated unless a specialization below says so.
+    template <class TyOp, class Ty> struct _redux_op_supported                 : public _CG_STL_NAMESPACE::false_type {};
+#ifdef _CG_HAS_OP_REDUX  // per-operator specializations exist only when _CG_HAS_OP_REDUX is defined
+    template <class Ty> struct _redux_op_supported<cooperative_groups::plus<Ty>,    Ty> : public redux_is_add_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::less<Ty>,    Ty> : public redux_is_add_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::greater<Ty>, Ty> : public redux_is_add_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::bit_and<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::bit_or<Ty>,  Ty> : public redux_is_logical_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::bit_xor<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
+#endif
+
+    template <class Ty, template <class> class TyOp>  // support lookup keyed on remove_qual<TyOp<Ty>> and Ty
+    using redux_op_supported = _redux_op_supported<
+            typename details::remove_qual<TyOp<Ty>>,
+            Ty>;
+
+    // Groups smaller than 16 actually have worse performance characteristics when used with redux;
+    // tiles of size 16 and 32 perform the same or better and have better code generation profiles.
+    template <class TyGroup> struct _redux_group_optimized : public _CG_STL_NAMESPACE::false_type {};  // default: not optimized
+
+    template <unsigned int Sz, typename TyPar>
+    struct _redux_group_optimized<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
+                                                                                            bool,
+                                                                                            (Sz >= 16)> {};
+    template <unsigned int Sz, typename TyPar>
+    struct _redux_group_optimized<internal_thread_block_tile<Sz, TyPar>>            : public _CG_STL_NAMESPACE::integral_constant<
+                                                                                            bool,
+                                                                                            (Sz >= 16)> {};
+    template <>
+    struct _redux_group_optimized<cooperative_groups::coalesced_group>              : public _CG_STL_NAMESPACE::true_type  {};
+
+    template <typename TyGroup>  // queries the trait on remove_qual<TyGroup>
+    using redux_group_optimized = _redux_group_optimized<details::remove_qual<TyGroup>>;
+
+    template <template <class> class TyOp>  // int overload: declared here, defined only via the specializations below
+    _CG_STATIC_QUALIFIER int pick_redux(int mask, int val);
+    template <template <class> class TyOp>  // unsigned int overload: same scheme
+    _CG_STATIC_QUALIFIER unsigned int pick_redux(int mask, unsigned int val);
+
+#ifdef _CG_HAS_OP_REDUX  // each specialization maps one operator template to its __reduce_*_sync intrinsic
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::plus>(int mask, int val) {
+        return __reduce_add_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::less>(int mask, int val) {
+        return __reduce_min_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::greater>(int mask, int val) {
+        return __reduce_max_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_and>(int mask, int val) {
+        return __reduce_and_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_xor>(int mask, int val) {
+        return __reduce_xor_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_or>(int mask, int val) {
+        return __reduce_or_sync(mask, val);
+    }
+
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::plus>(int mask, unsigned int val) {
+        return __reduce_add_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::less>(int mask, unsigned int val) {
+        return __reduce_min_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::greater>(int mask, unsigned int val) {
+        return __reduce_max_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_and>(int mask, unsigned int val) {
+        return __reduce_and_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_xor>(int mask, unsigned int val) {
+        return __reduce_xor_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_or>(int mask, unsigned int val) {
+        return __reduce_or_sync(mask, val);
+    }
+#endif
+
+
+    template <typename TyVal, bool = _CG_STL_NAMESPACE::is_unsigned<TyVal>::value>  // bool selects the unsigned (true) or other (false) path
+    struct _accelerated_op;
+
+    // Signed type redux intrinsic dispatch: val is cast to int and the result is cast back to TyVal
+    template <typename TyVal>
+    struct _accelerated_op<TyVal, false> {
+        template <template <class> class TyOp>
+        _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
+            return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<int>(val)));
+        }
+    };
+
+    // Unsigned type redux intrinsic dispatch: val is cast to unsigned int and the result is cast back to TyVal
+    template <typename TyVal>
+    struct _accelerated_op<TyVal, true> {
+        template <template <class> class TyOp>
+        _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
+            return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<unsigned int>(val)));
+        }
+    };
+
+    template <typename TyVal>  // convenience alias that lets the bool parameter take its default
+    using accelerated_op = _accelerated_op<TyVal>;
+
+
+    template <typename TyVal, typename TyFnInput, typename TyGroup>  // chooses between the redux path and coalesced_reduce
+    class _redux_dispatch {
+        template <class Ty, template <class> class TyOp>  // true iff both the op/type pair and the group qualify
+        using _redux_is_usable = _CG_STL_NAMESPACE::integral_constant<bool,
+            redux_op_supported<Ty, TyOp>::value &&
+            redux_group_optimized<TyGroup>::value>;
+
+        template <class Ty, template <class> class TyOp>  // enable_if tag: well-formed only when redux is usable
+        using redux_is_usable = typename _CG_STL_NAMESPACE::enable_if<_redux_is_usable<Ty, TyOp>::value, void>::type*;
+
+        template <class Ty, template <class> class TyOp>  // enable_if tag: well-formed only when redux is not usable
+        using redux_is_not_usable = typename _CG_STL_NAMESPACE::enable_if<!_redux_is_usable<Ty, TyOp>::value, void>::type*;
+
+    public:
+        // Dispatch to redux if the combination of op and args is supported (rvalue-operator overload)
+        template<
+            template <class> class TyOp,
+            redux_is_usable<TyFnInput, TyOp> = nullptr>
+        _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
+            // Retrieve the mask for the group and dispatch to redux; op is only used to deduce the return type
+            return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
+        }
+
+        template<  // same redux path, lvalue-operator overload
+            template <class> class TyOp,
+            redux_is_usable<TyFnInput, TyOp> = nullptr>
+        _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
+            // Retrieve the mask for the group and dispatch to redux; op is only used to deduce the return type
+            return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
+        }
+
+        // Fallback shuffle sync reduction, selected when redux is not usable (rvalue-operator overload only)
+        template <
+            template <class> class TyOp,
+            redux_is_not_usable<TyFnInput, TyOp> = nullptr>
+        _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
+            // Dispatch to the fallback shuffle sync accelerated reduction, forwarding both val and op
+            return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
+        }
+
+    };
+
+    // Group support for reduce: false by default, true for the tile and coalesced group types below.
+    template <class TyGroup> struct _reduce_group_supported : public _CG_STL_NAMESPACE::false_type {};
+
+    template <unsigned int Sz, typename TyPar>
+    struct _reduce_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
+    template <unsigned int Sz, typename TyPar>
+    struct _reduce_group_supported<internal_thread_block_tile<Sz, TyPar>>            : public _CG_STL_NAMESPACE::true_type {};
+    template <>
+    struct _reduce_group_supported<cooperative_groups::coalesced_group>              : public _CG_STL_NAMESPACE::true_type {};
+
+    template <typename TyGroup>  // queries the trait on remove_qual<TyGroup>
+    using reduce_group_supported = _reduce_group_supported<details::remove_qual<TyGroup>>;
+
+    // Typed-operator overload (rvalue op): checks operator/value type agreement, then defers to _redux_dispatch.
+    template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
+    _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
+        static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
+        return details::_redux_dispatch<TyVal, TyFnInput, TyGroup>::reduce(
+            group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
+    }
+
+    // Typed-operator overload (lvalue op): same checks; op is handed to _redux_dispatch via forward<TyOp<TyFnInput>>.
+    template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
+    _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
+        static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
+        return details::_redux_dispatch<TyVal, TyFnInput, TyGroup>::reduce(
+            group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
+    }
+
+
+    template <typename TyVal, typename TyOp, typename TyGroup>  // generic-functor overload: forwards straight to coalesced_reduce
+    _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
+        return details::coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+    }
+
+    template <unsigned int GroupId>  // primary template is only declared; specializations are keyed on a group id
+    struct tile_reduce_dispatch;
+
+    template <>
+    struct tile_reduce_dispatch<details::coalesced_group_id> {
+        template <typename TyGroup, typename TyVal, typename TyFn>  // forwards val and op unchanged to details::reduce
+        _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            return details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+        }
+    };
+
+#if defined(_CG_CPP11_FEATURES)
+    template <>
+    struct tile_reduce_dispatch<details::multi_tile_group_id> {
+        template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>  // two-stage reduce for multi-warp tiles
+        _CG_STATIC_QUALIFIER auto reduce(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
+            using TyRet = details::remove_qual<TyVal>;
+            const unsigned int num_warps = Size / 32;  // number of 32-thread tiles in the group
+
+            auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {  // stage 1: reduce inside one 32-thread tile
+                    *warp_scratch_location =
+                        details::reduce(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+            };
+            auto inter_warp_lambda =  // stage 2: reduce the value at thread_scratch_location across a num_warps-wide tile, in place
+                [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
+                    *thread_scratch_location =
+                        details::reduce(subwarp, *thread_scratch_location, _CG_STL_NAMESPACE::forward<TyFn>(op));
+            };
+            return details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);  // NOTE(review): presumably runs stage 1, syncs, then stage 2 - confirm in helper
+        }
+    };
+
+    template <unsigned int GroupId>  // primary template is only declared; specializations are keyed on a group id
+    struct tile_async_reduce_dispatch;
+
+    template <>
+    struct tile_async_reduce_dispatch<details::coalesced_group_id> {
+        template <typename GroupT, typename TyDst, typename TyVal, typename TyFn, typename TyResHandler>  // dst is not used in this body
+        _CG_STATIC_QUALIFIER void reduce(const GroupT& group, TyDst& dst, TyVal&& val, TyFn&& op, TyResHandler& res_handler) {
+            // Do a regular in-group reduction; every thread obtains the result
+            auto result = details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+
+            // Only the rank-0 thread stores/updates the destination through res_handler
+            if (group.thread_rank() == 0) {
+                res_handler(result);
+            }
+        }
+    };
+
+    template <>
+    struct tile_async_reduce_dispatch<details::multi_tile_group_id> {
+        template <unsigned int TySize, typename ParentT, typename TyDst, typename TyInputVal, typename TyFn, typename TyResHandler>  // dst is not used in this body
+        _CG_STATIC_QUALIFIER void reduce(const thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyInputVal&& val, TyFn&& op, TyResHandler& res_handler) {
+            using TyVal = remove_qual<TyInputVal>;
+            const unsigned int num_warps = TySize / 32;  // number of 32-thread tiles in the group
+            details::barrier_t* sync_location = multi_warp_sync_location_getter(group);
+            auto warp_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, group.thread_rank() / 32);  // slot indexed by thread_rank / 32
+
+            // Do the in-warp reduce and store the partial result in this warp's scratch slot
+            auto warp = details::tiled_partition_internal<32, thread_block_tile<TySize, ParentT>>();
+            *warp_scratch_location = details::reduce(warp, _CG_STL_NAMESPACE::forward<TyInputVal>(val), op);
+
+            // A tile of size num_warps from the last warp to arrive does the final reduction step
+            if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), num_warps)) {
+                auto subwarp = details::tiled_partition_internal<num_warps, decltype(warp)>();
+                if (subwarp.meta_group_rank() == 0) {
+                    auto thread_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, subwarp.thread_rank());
+                    auto thread_val = *thread_scratch_location;
+                    // Release the other warps; their contributions have already been read.
+                    subwarp.sync();
+                    details::sync_warps_release(sync_location, subwarp.thread_rank() == 0, details::cta::thread_rank(), num_warps);
+                    TyVal result = details::reduce(subwarp, thread_val, op);
+                    // One thread (subwarp rank 0) stores the result or updates the atomic via res_handler
+                    if (subwarp.thread_rank() == 0) {
+                        res_handler(result);
+                    }
+                }
+                warp.sync();
+            }
+        }
+    };
+#endif
+
+    template <typename TyGroup, typename TyInputVal, typename TyRetVal>  // compile-time validation only; the body has no runtime statements
+    _CG_QUALIFIER void check_reduce_params() {
+        static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
+        static_assert(details::reduce_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
+    }
+
+    template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>  // compile-time validation only
+    _CG_QUALIFIER void check_async_reduce_params() {
+        check_reduce_params<TyGroup, TyInputVal, TyRetVal>();  // reuse the synchronous reduce checks
+        static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
+    }
+} // details
+
+// Public reduce entry point: validates group/operator types, then dispatches on TyGroup::_group_id.
+template <typename TyGroup, typename TyVal, typename TyFn>
+_CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    details::check_reduce_params<TyGroup, details::remove_qual<TyVal>, decltype(op(val, val))>();
+    return details::tile_reduce_dispatch<TyGroup::_group_id>::reduce(
+        group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+
+#if defined(_CG_CPP11_FEATURES)
+
+# if defined(_CG_HAS_STL_ATOMICS)
+// Reduces val across the group with op; a single thread then folds the reduced value
+// into the cuda::atomic dst through details::atomic_update.
+template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
+void _CG_QUALIFIER reduce_update_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
+    details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    auto fold_into_dst = [&] (TyVal& reduced) { details::atomic_update(dst, reduced, op); };
+    details::tile_async_reduce_dispatch<TyGroup::_group_id>::reduce(
+        group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), fold_into_dst);
+}
+
+// Reduces val across the group with op; a single thread then folds the reduced value
+// into the cuda::atomic_ref dst through details::atomic_update.
+template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
+void _CG_QUALIFIER reduce_update_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
+    details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    auto fold_into_dst = [&] (TyVal& reduced) { details::atomic_update(dst, reduced, op); };
+    details::tile_async_reduce_dispatch<TyGroup::_group_id>::reduce(
+        group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), fold_into_dst);
+}
+
+// Reduces val across the group with op; a single thread then writes the reduced value
+// to the cuda::atomic dst through details::atomic_store.
+template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
+void _CG_QUALIFIER reduce_store_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
+    details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    auto write_to_dst = [&] (TyVal& reduced) { details::atomic_store(dst, reduced); };
+    details::tile_async_reduce_dispatch<TyGroup::_group_id>::reduce(
+        group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), write_to_dst);
+}
+
+// Reduces val across the group with op; a single thread then writes the reduced value
+// to the cuda::atomic_ref dst through details::atomic_store.
+template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
+void _CG_QUALIFIER reduce_store_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
+    details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    auto write_to_dst = [&] (TyVal& reduced) { details::atomic_store(dst, reduced); };
+    details::tile_async_reduce_dispatch<TyGroup::_group_id>::reduce(
+        group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), write_to_dst);
+}
+# endif
+
+// Reduces val across the group with op; a single thread then writes the reduced value
+// to *dst with a plain (non-atomic) assignment.
+template<typename TyGroup, typename TyVal, typename TyInputVal, typename TyFn>
+void _CG_QUALIFIER reduce_store_async(const TyGroup& group, TyVal* dst, TyInputVal&& val, TyFn&& op) {
+    details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    auto write_to_dst = [&] (TyVal& reduced) { *dst = reduced; };
+    details::tile_async_reduce_dispatch<TyGroup::_group_id>::reduce(
+        group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), write_to_dst);
+}
+#endif
+
+_CG_END_NAMESPACE
+
+#endif // _CG_REDUCE_H_
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/scan.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..96d68350e48307d120289e22872abc66f5188115
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/scan.h
@@ -0,0 +1,320 @@
+/* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_SCAN_H_
+#define _CG_SCAN_H_
+
+#include "info.h"
+#include "helpers.h"
+#include "functional.h"
+#include "coalesced_scan.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+    // Group support for scan: false by default, true for the tile and coalesced group types below.
+    template <class TyGroup> struct _scan_group_supported : public _CG_STL_NAMESPACE::false_type {};
+
+    template <unsigned int Sz, typename TyPar>
+    struct _scan_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
+    template <unsigned int Sz, typename TyPar>
+    struct _scan_group_supported<internal_thread_block_tile<Sz, TyPar>>            : public _CG_STL_NAMESPACE::true_type {};
+    template <>
+    struct _scan_group_supported<cooperative_groups::coalesced_group>              : public _CG_STL_NAMESPACE::true_type {};
+
+    template <typename TyGroup>  // queries the trait on remove_qual<TyGroup>
+    using scan_group_supported = _scan_group_supported<details::remove_qual<TyGroup>>;
+
+    template <bool IsIntegralPlus>  // forward declaration only; no definition appears in this part of the header
+    struct integral_optimized_scan;
+
+    enum class ScanType { exclusive, inclusive };  // compile-time selector used by the scan dispatchers
+
+    template <unsigned int GroupId,  ScanType TyScan>  // primary template is only declared; specialized per group id
+    struct scan_dispatch;
+
+    template <ScanType TyScan>
+    struct scan_dispatch<details::coalesced_group_id, TyScan> {
+        template <typename TyGroup, typename TyVal, typename TyFn>  // always scans inclusively first, then converts if exclusive
+        _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            auto scan_result = coalesced_inclusive_scan(group, val, op);  // val and op are passed as lvalues here; they are forwarded below
+            if (TyScan == ScanType::exclusive) {
+                scan_result = convert_inclusive_to_exclusive(group,
+                                                             scan_result,
+                                                             _CG_STL_NAMESPACE::forward<TyVal>(val),
+                                                             _CG_STL_NAMESPACE::forward<TyFn>(op));
+            }
+            return scan_result;
+        }
+    };
+
+#if defined(_CG_CPP11_FEATURES)
+    template <ScanType TyScan>
+    struct scan_dispatch<details::multi_tile_group_id, TyScan> {
+        template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
+            using TyRet = details::remove_qual<TyVal>;
+            const unsigned int num_warps = Size / 32;
+            // In-warp scan result, calculated in warp_lambda and captured by reference
+            TyRet warp_scan;
+
+            // In-warp scan; the last thread of the warp puts the warp's sum in warp_scratch_location
+            auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
+                warp_scan = 
+                    details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+                if (warp.thread_rank() + 1 == warp.size()) {
+                    *warp_scratch_location = warp_scan;
+                }
+                if (TyScan == ScanType::exclusive) {
+                    warp_scan = warp.shfl_up(warp_scan, 1);
+                }
+            };
+
+            // A tile of size num_warps performs the final scan part (exclusive scan of warp sums); other threads
+            // will add it to their in-warp scan result
+            auto inter_warp_lambda =
+                [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
+                    auto thread_val = *thread_scratch_location;
+                    auto result = coalesced_inclusive_scan(subwarp, thread_val, op);
+                    *thread_scratch_location = convert_inclusive_to_exclusive(subwarp, result, thread_val, op);
+            };
+
+            TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
+            if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {  // exclusive scan, thread of rank 0 in its warp
+                return previous_warps_sum;
+            }
+            if (warpType::meta_group_rank() == 0) {  // warp with meta_group_rank 0: in-warp result is returned unchanged
+                return warp_scan;
+            }
+            else {
+                return op(warp_scan, previous_warps_sum);
+            }
+        }
+    };
+
+#if defined(_CG_HAS_STL_ATOMICS)
+    template <unsigned int GroupId,  ScanType TyScan>  // primary template is only declared; specialized per group id
+    struct scan_update_dispatch;
+
+    template <ScanType TyScan>
+    struct scan_update_dispatch<details::coalesced_group_id, TyScan> {
+        template <typename TyGroup, typename TyAtomic, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            details::remove_qual<TyVal> old;
+
+            // Do a regular in-group inclusive scan
+            auto scan_result = details::coalesced_inclusive_scan(group, val, op);
+
+            // The last thread updates the atomic and distributes the value it got back to the other threads
+            if (group.thread_rank() == group.size() - 1) {  // last-ranked thread holds the full inclusive result
+                old = atomic_update(dst, scan_result, _CG_STL_NAMESPACE::forward<TyFn>(op));
+            }
+            old = group.shfl(old, group.size() - 1);
+            if (TyScan == ScanType::exclusive) {
+                scan_result = convert_inclusive_to_exclusive(group, scan_result, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+            }
+            scan_result = op(old, scan_result);
+            return scan_result;
+        }
+    };
+
+    template <ScanType TyScan>
+    struct scan_update_dispatch<details::multi_tile_group_id, TyScan> {  // scan-and-update specialization selected for multi_tile_group_id
+        template <unsigned int Size, typename ParentT, typename TyAtomic, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
+            using TyRet = details::remove_qual<TyVal>;
+            const unsigned int num_warps = Size / 32;
+            // In warp scan result, calculated in warp_lambda
+            TyRet warp_scan;
+            // Both lambdas below capture by reference: warp_lambda writes warp_scan, inter_warp_lambda uses dst and op.
+            // In warp scan, put sum in the warp_scratch_location
+            auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
+                warp_scan =
+                    details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+                if (warp.thread_rank() + 1 == warp.size()) {  // last rank of the warp publishes the warp total
+                    *warp_scratch_location = warp_scan;
+                }
+                if (TyScan == ScanType::exclusive) {
+                    warp_scan = warp.shfl_up(warp_scan, 1);  // exclusive form: take the in-warp result shifted by one rank
+                }
+            };
+
+            // Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
+            // to its in-warp scan result
+            auto inter_warp_lambda =
+                [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
+                    auto thread_val = *thread_scratch_location;
+                    auto scan_result = details::coalesced_inclusive_scan(subwarp, thread_val, op);
+                    TyRet offset;
+                    // Single thread does the atomic update with sum of all contributions and reads the old value.
+                    if (subwarp.thread_rank() == subwarp.size() - 1) {
+                        offset = details::atomic_update(dst, scan_result, op);
+                    }
+                    offset = subwarp.shfl(offset, subwarp.size() - 1);  // read the old value from the rank that did the update
+                    scan_result = convert_inclusive_to_exclusive(subwarp, scan_result, thread_val, op);
+                    // Add offset read from the atomic to the scanned warp sum.
+                    // Skipping first thread, since it got a default-constructed value from the conversion,
+                    // it should just return the offset received from the thread that did the atomic update.
+                    if (subwarp.thread_rank() != 0) {
+                        offset = op(scan_result, offset);
+                    }
+                    *thread_scratch_location = offset;  // written back through the same scratch slot it was read from
+            };
+            // NOTE(review): the helper presumably runs warp_lambda in every warp and inter_warp_lambda on one num_warps-wide tile -- confirm in multi_warp_collectives_helper.
+            TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
+            if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {  // exclusive form, warp rank 0: the shifted warp_scan is not used
+                return previous_warps_sum;
+            }
+            return op(warp_scan, previous_warps_sum);
+        }
+    };
+#endif
+#endif
+
+    template <typename TyGroup, typename TyInputVal, typename TyRetVal>
+    _CG_QUALIFIER void check_scan_params() {  // compile-time validation only: the body consists solely of static_asserts
+        static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");  // TyInputVal vs. TyRetVal, as judged by is_op_type_same
+        static_assert(details::scan_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");  // TyGroup must satisfy scan_group_supported
+    }
+
+#if defined(_CG_HAS_STL_ATOMICS)
+    template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
+    _CG_QUALIFIER void check_scan_update_params() {  // compile-time validation used by the *_scan_update entry points
+        check_scan_params<TyGroup, TyInputVal, TyRetVal>();  // same checks as a plain scan
+        static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");  // additionally compares TyDstVal with TyInputVal
+    }
+#endif
+
+} // details
+
+// Inclusive scan of `val` over `group` with `op`: validates the template arguments, then calls the dispatcher selected by TyGroup::_group_id.
+template <typename TyGroup, typename TyVal, typename TyFn>
+_CG_QUALIFIER auto inclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
+    return details::scan_dispatch<TyGroup::_group_id, details::ScanType::inclusive>::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+// Overload without an operator: forwards to the overload above with cooperative_groups::plus over the unqualified value type.
+template <typename TyGroup, typename TyVal>
+_CG_QUALIFIER details::remove_qual<TyVal> inclusive_scan(const TyGroup& group, TyVal&& val) {
+    using TyDefaultOp = cooperative_groups::plus<details::remove_qual<TyVal>>;
+    return inclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), TyDefaultOp());
+}
+
+// Exclusive scan of `val` over `group` with `op`: validates the template arguments, then calls the dispatcher selected by TyGroup::_group_id.
+template <typename TyGroup, typename TyVal, typename TyFn>
+_CG_QUALIFIER auto exclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
+    return details::scan_dispatch<TyGroup::_group_id, details::ScanType::exclusive>::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+// Overload without an operator: forwards to the overload above with cooperative_groups::plus over the unqualified value type.
+template <typename TyGroup, typename TyVal>
+_CG_QUALIFIER details::remove_qual<TyVal> exclusive_scan(const TyGroup& group, TyVal&& val) {
+    using TyDefaultOp = cooperative_groups::plus<details::remove_qual<TyVal>>;
+    return exclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), TyDefaultOp());
+}
+
+#if defined(_CG_HAS_STL_ATOMICS)
+// Inclusive scan-and-update against a cuda::atomic destination: validates the template arguments, then calls the dispatcher selected by TyGroup::_group_id.
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
+_CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    return details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+// Overload without an operator: forwards to the overload above with cooperative_groups::plus<TyVal>.
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
+_CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco> & dst, TyInputVal&& val) {
+    using TyDefaultOp = cooperative_groups::plus<TyVal>;
+    return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), TyDefaultOp());
+}
+
+// Exclusive scan-and-update against a cuda::atomic destination: validates the template arguments, then calls the dispatcher selected by TyGroup::_group_id.
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
+_CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    return details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+// Overload without an operator: forwards to the overload above with cooperative_groups::plus<TyVal>.
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
+_CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val) {
+    using TyDefaultOp = cooperative_groups::plus<TyVal>;
+    return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), TyDefaultOp());
+}
+
+// Inclusive scan-and-update against a cuda::atomic_ref destination: validates the template arguments, then calls the dispatcher selected by TyGroup::_group_id.
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
+_CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    return details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+// Overload without an operator: forwards to the overload above with cooperative_groups::plus<TyVal>.
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
+_CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco> & dst, TyInputVal&& val) {
+    using TyDefaultOp = cooperative_groups::plus<TyVal>;
+    return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), TyDefaultOp());
+}
+
+// Exclusive scan-and-update against a cuda::atomic_ref destination: validates the template arguments, then calls the dispatcher selected by TyGroup::_group_id.
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
+_CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    return details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+// Overload without an operator: forwards to the overload above with cooperative_groups::plus<TyVal>.
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
+_CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val) {
+    using TyDefaultOp = cooperative_groups::plus<TyVal>;
+    return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), TyDefaultOp());
+}
+#endif
+
+_CG_END_NAMESPACE
+
+#endif // _CG_SCAN_H_
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/memcpy_async.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/memcpy_async.h
new file mode 100644
index 0000000000000000000000000000000000000000..50b907d9a1fe45cdc411891a20d8fd035118e5be
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/memcpy_async.h
@@ -0,0 +1,62 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _COOPERATIVE_GROUPS_MEMCPY_ASYNC
+#define _COOPERATIVE_GROUPS_MEMCPY_ASYNC
+// Wrapper header: includes the core cooperative-groups headers, then details/async.h when _CG_CPP11_FEATURES is defined.
+#include "../cooperative_groups.h"
+#include "details/info.h"
+// Without _CG_CPP11_FEATURES the build stops at the #error below.
+#ifdef _CG_CPP11_FEATURES
+# include "details/async.h"
+#else
+# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
+         -std=c++11 compiler option.
+#endif
+
+#endif // _COOPERATIVE_GROUPS_MEMCPY_ASYNC
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/reduce.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c87d780db0b437f1ae06e0ef8d60137233795c0
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/reduce.h
@@ -0,0 +1,63 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _COOPERATIVE_GROUPS_REDUCE_H
+#define _COOPERATIVE_GROUPS_REDUCE_H
+// Wrapper header: includes the core cooperative-groups headers, then details/reduce.h when _CG_CPP11_FEATURES is defined.
+#include "../cooperative_groups.h"
+#include "details/info.h"
+// Without _CG_CPP11_FEATURES the build stops at the #error below.
+#ifdef _CG_CPP11_FEATURES
+# include "details/reduce.h"
+#else
+# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
+         -std=c++11 compiler option.
+#endif
+
+
+#endif //_COOPERATIVE_GROUPS_REDUCE_H
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/scan.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bc27078028318ada00cbcccd052e0d6cc930cfe
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/scan.h
@@ -0,0 +1,63 @@
+/* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _COOPERATIVE_GROUPS_SCAN_H
+#define _COOPERATIVE_GROUPS_SCAN_H
+// Wrapper header: includes the core cooperative-groups headers, then details/scan.h when _CG_CPP11_FEATURES is defined.
+#include "../cooperative_groups.h"
+#include "details/info.h"
+// Without _CG_CPP11_FEATURES the build stops at the #error below.
+#ifdef _CG_CPP11_FEATURES
+# include "details/scan.h"
+#else
+# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
+         -std=c++11 compiler option.
+#endif
+
+
+#endif //_COOPERATIVE_GROUPS_SCAN_H
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/common_functions.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/common_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7e70950fb51d0d58f8dd99239e6b36ba89c4779
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/common_functions.h
@@ -0,0 +1,310 @@
+/*
+ * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* marker not set by whoever included this file: emit a diagnostic */
+#if defined(_MSC_VER)
+#pragma message("crt/common_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/common_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ /* set the marker for the remainder of this file */
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__ /* records that this file set the marker, so the #undef at the end of the file runs */
+#endif
+
+#if !defined(__COMMON_FUNCTIONS_H__)
+#define __COMMON_FUNCTIONS_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#include "builtin_types.h"
+#include "host_defines.h"
+
+#define __CUDACC_VER__ "__CUDACC_VER__ is no longer supported.  Use __CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__, and __CUDACC_VER_BUILD__ instead."
+/* __CUDACC_VER__ above expands to an explanatory string literal; the API version macros below default to the compiler version macros unless already defined. */
+#ifndef __CUDA_API_VER_MAJOR__
+#define __CUDA_API_VER_MAJOR__ __CUDACC_VER_MAJOR__
+#endif /* __CUDA_API_VER_MAJOR__ */
+
+#ifndef __CUDA_API_VER_MINOR__
+#define __CUDA_API_VER_MINOR__ __CUDACC_VER_MINOR__
+#endif /* __CUDA_API_VER_MINOR__ */
+
+#if !defined(__CUDACC_RTC__)
+#include <string.h>
+#include <time.h>
+/* When __CUDACC_RTC__ is not defined, the three declarations below are wrapped in extern "C"; otherwise the wrapper is omitted. */
+extern "C"
+{
+#endif /* !__CUDACC_RTC__ */
+extern _CRTIMP __host__ __device__ __device_builtin__ __cudart_builtin__ clock_t __cdecl clock(void)
+#if defined(__QNX__)
+asm("clock32") /* on QNX the declaration carries the assembler name clock32 */
+#endif
+__THROW;
+extern         __host__ __device__ __device_builtin__ __cudart_builtin__ void*   __cdecl memset(void*, int, size_t) __THROW;
+extern         __host__ __device__ __device_builtin__ __cudart_builtin__ void*   __cdecl memcpy(void*, const void*, size_t) __THROW;
+#if !defined(__CUDACC_RTC__)
+}
+#endif /* !__CUDACC_RTC__ */
+
+#if defined(__CUDA_ARCH__)
+
+#if defined(__CUDACC_RTC__) /* placement new/delete: defined inline in this branch, only declared in the other */
+inline __host__ __device__ void* operator new(size_t, void *p) { return p; }
+inline __host__ __device__ void* operator new[](size_t, void *p) { return p; }
+inline __host__ __device__ void operator delete(void*, void*) { }
+inline __host__ __device__ void operator delete[](void*, void*) { }
+#else /* !__CUDACC_RTC__ */
+#ifndef __CUDA_INTERNAL_SKIP_CPP_HEADERS__
+#include <new>
+#endif
+/* STD expands to "std::" when __GNUC__ is defined and to nothing otherwise; it qualifies size_t in the declarations below. */
+#if defined (__GNUC__)
+
+#define STD \
+        std::
+
+#else /* __GNUC__ */
+
+#define STD
+
+#endif /* __GNUC__ */
+/* Placement forms first; the sized operator delete forms are declared only under the C++14-or-later condition further down. */
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new(STD size_t, void*) throw();
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new[](STD size_t, void*) throw();
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*, void*) throw();
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*, void*) throw();
+# if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__)
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*, STD size_t) throw();
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*, STD size_t) throw();
+#endif /* __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__)  || defined(__CUDA_ICC_CPP14__) */
+#endif /* __CUDACC_RTC__ */
+
+#if !defined(__CUDACC_RTC__)
+#include <stdio.h>
+#include <stdlib.h>
+#endif /* !__CUDACC_RTC__ */
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION) /* QNX without libc++: the extern "C" block below is placed inside namespace std */
+namespace std {
+#endif
+extern "C"
+{
+extern
+#if !defined(_MSC_VER) || _MSC_VER < 1900
+_CRTIMP
+#endif
+
+#if defined(__GLIBC__) && defined(__GLIBC_MINOR__) && ( (__GLIBC__ < 2) || ( (__GLIBC__ == 2) && (__GLIBC_MINOR__ < 3) ) )
+__host__ __device__ __device_builtin__ __cudart_builtin__ int     __cdecl printf(const char*, ...) __THROW;
+#else /* newer glibc */
+__host__ __device__ __device_builtin__ __cudart_builtin__ int     __cdecl printf(const char*, ...);
+#endif /* defined(__GLIBC__) && defined(__GLIBC_MINOR__) && ( (__GLIBC__ < 2) || ( (__GLIBC__ == 2) && (__GLIBC_MINOR__ < 3) ) ) */
+/* printf carries __THROW only when glibc is older than 2.3 (condition above); the "extern" and optional _CRTIMP before it belong to that declaration. */
+
+extern _CRTIMP __host__ __device__ __cudart_builtin__ void*   __cdecl malloc(size_t) __THROW;
+extern _CRTIMP __host__ __device__ __cudart_builtin__ void    __cdecl free(void*) __THROW;
+
+#if defined(_MSC_VER)
+extern  __host__ __device__ __cudart_builtin__ void*   __cdecl _alloca(size_t);
+#endif
+/* On QNX, alloca is redefined as a macro over __builtin_alloca. */
+#if defined(__QNX__)
+#undef alloca
+#define alloca(__S) __builtin_alloca(__S)
+#endif
+}
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+#if !defined(__CUDACC_RTC__)
+#include <assert.h>
+#endif /* !__CUDACC_RTC__ */
+
+extern "C"
+{
+#if defined(__CUDACC_RTC__) /* exactly one assertion-failure hook is declared, selected by this #if/#elif chain */
+extern __host__ __device__ void __assertfail(const char * __assertion,
+                                             const char *__file,
+                                             unsigned int __line,
+                                             const char *__function,
+                                             size_t charsize);
+#elif defined(__APPLE__)
+#define __builtin_expect(exp,c) (exp)
+extern __host__ __device__ __cudart_builtin__ void __assert_rtn(
+  const char *, const char *, int, const char *);
+#elif defined(__ANDROID__)
+extern __host__ __device__ __cudart_builtin__ void __assert2(
+  const char *, int, const char *, const char *);
+#elif defined(__QNX__)
+#if !defined(_LIBCPP_VERSION) /* QNX without libc++: __assert is declared inside namespace std */
+namespace std {
+#endif
+extern __host__ __device__ __cudart_builtin__ void __assert(
+  const char *, const char *, unsigned int, const char *);
+#if !defined(_LIBCPP_VERSION)
+}
+#endif
+#elif defined(__HORIZON__)
+extern __host__ __device__ __cudart_builtin__ void __assert_fail(
+  const char *, const char *, int, const char *);
+#elif defined(__GNUC__)
+extern __host__ __device__ __cudart_builtin__ void __assert_fail(
+  const char *, const char *, unsigned int, const char *)
+  __THROW;
+#elif defined(_WIN32)
+extern __host__ __device__ __cudart_builtin__ _CRTIMP void __cdecl _wassert(
+  const wchar_t *, const wchar_t *, unsigned);
+#endif
+}
+
+#if defined(__CUDACC_RTC__) /* this branch supplies assert() as a macro and then declares operator new/delete */
+#ifdef NDEBUG
+#define assert(e) (static_cast<void>(0))
+#else /* !NDEBUG */
+#define __ASSERT_STR_HELPER(x) #x
+#define assert(e) ((e) ? static_cast<void>(0)\
+                       : __assertfail(__ASSERT_STR_HELPER(e), __FILE__,\
+                                      __LINE__, __PRETTY_FUNCTION__,\
+                                      sizeof(char)))
+#endif /* NDEBUG */
+__host__ __device__  void* operator new(size_t);
+__host__ __device__  void* operator new[](size_t);
+__host__ __device__  void operator delete(void*);
+__host__ __device__  void operator delete[](void*);
+# if __cplusplus >= 201402L
+__host__ __device__  void operator delete(void*, size_t);
+__host__ __device__  void operator delete[](void*, size_t);
+#endif /* __cplusplus >= 201402L */
+/* C++17 and later: std::align_val_t is declared right here, followed by the aligned new/delete overloads. */
+#if __cplusplus >= 201703L
+namespace std { enum class align_val_t : size_t {}; }
+__host__ __device__ void*   __cdecl operator new(size_t sz, std::align_val_t) noexcept;
+__host__ __device__ void*   __cdecl operator new[](size_t sz, std::align_val_t) noexcept;
+__host__ __device__ void    __cdecl operator delete(void* ptr, std::align_val_t) noexcept;
+__host__ __device__ void    __cdecl operator delete[](void* ptr, std::align_val_t) noexcept;
+__host__ __device__ void    __cdecl operator delete(void* ptr, size_t, std::align_val_t) noexcept;
+__host__ __device__ void    __cdecl operator delete[](void* ptr, size_t, std::align_val_t) noexcept;
+#endif  /* __cplusplus >= 201703L */
+
+#else /* !__CUDACC_RTC__ */
+#if defined (__GNUC__)
+/* THROWBADALLOC is the exception specification placed on operator new/new[] below; the conditions here pick empty or throw(STD bad_alloc). */
+#define __NV_GLIBCXX_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+
+#if (__cplusplus >= 201103L)  && ((!(defined(__QNX__) && defined(_LIBCPP_VERSION))) || (defined(__QNX__) && __NV_GLIBCXX_VERSION >= 80300))
+#define THROWBADALLOC
+#else
+#if defined(__ANDROID__) && !defined(_LIBCPP_VERSION) && (defined(__BIONIC__) || __NV_GLIBCXX_VERSION < 40900)
+#define THROWBADALLOC
+#else
+#define THROWBADALLOC  throw(STD bad_alloc)
+#endif
+#endif
+#define __DELETE_THROW throw()
+/* NOTE(review): __DELETE_THROW is defined above but is neither used nor #undef'd anywhere in this file. */
+#undef __NV_GLIBCXX_VERSION
+
+#else /* __GNUC__ */
+
+#define THROWBADALLOC  throw(...)
+
+#endif /* __GNUC__ */
+
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new(STD size_t) THROWBADALLOC;
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new[](STD size_t) THROWBADALLOC;
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*) throw();
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*) throw();
+# if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__)
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*, STD size_t) throw();
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*, STD size_t) throw();
+#endif /* __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__)  */
+/* Aligned overloads are declared only when the compiler defines __cpp_aligned_new to a non-zero value. */
+#if __cpp_aligned_new
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new(STD size_t, std::align_val_t);
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new[](STD size_t, std::align_val_t);
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*, std::align_val_t) noexcept;
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*, std::align_val_t) noexcept;
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*, STD size_t, std::align_val_t) noexcept;
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*, STD size_t, std::align_val_t) noexcept;
+#endif  /* __cpp_aligned_new */
+/* THROWBADALLOC and STD are helper macros for this file only; both are undefined here. */
+#undef THROWBADALLOC
+#undef STD
+#endif /* __CUDACC_RTC__ */
+
+#endif /* __CUDA_ARCH__ */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDACC_RTC__) && (__CUDA_ARCH__ >= 350)
+#include "cuda_device_runtime_api.h"
+#endif
+
+#include "math_functions.h"
+
+#endif /* !__COMMON_FUNCTIONS_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__) /* true only when this file itself defined the marker at its top */
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ /* restore the state the includer had */
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/cudacc_ext.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/cudacc_ext.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d244463e73f0f7569a4707002c8e059bca67c6d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/cudacc_ext.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2021-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* included directly: diagnose, then set the flag plus a marker recording that this header set it */
+#if defined(_MSC_VER)
+#pragma message("crt/cudacc_ext.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else /* !_MSC_VER */
+#warning "crt/cudacc_ext.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif /* _MSC_VER */
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDACC_EXT_H__
+#endif /* !__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ */
+
+/* Remove the flag again only if this header was the one that defined it above. */
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDACC_EXT_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDACC_EXT_H__
+#endif /* __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDACC_EXT_H__ */
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_double_functions.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_double_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..7849c6c6e099e85a4676e7c9c38c05b5a5b02d26
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_double_functions.h
@@ -0,0 +1,1192 @@
+/*
+ * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* included directly: diagnose, then set the flag plus a marker recording that this header set it */
+#if defined(_MSC_VER)
+#pragma message("crt/device_double_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else /* !_MSC_VER */
+#warning "crt/device_double_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif /* _MSC_VER */
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H__
+#endif /* !__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ */
+
+#if !defined(__DEVICE_DOUBLE_FUNCTIONS_H__)
+#define __DEVICE_DOUBLE_FUNCTIONS_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDACC_RTC__) /* choose the declaration specifiers bundled into __DEVICE_DOUBLE_FUNCTIONS_DECL__ */
+#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+extern "C"
+{
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in a double as a 64-bit signed integer.
+ *
+ * Reinterpret the bits in the double-precision floating-point value \p x
+ * as a signed 64-bit integer.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ long long int         __double_as_longlong(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in a 64-bit signed integer as a double.
+ *
+ * Reinterpret the bits in the 64-bit signed integer value \p x as
+ * a double-precision floating-point value.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ double                __longlong_as_double(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation in round-to-nearest-even mode.
+ *
+ * Computes the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation, rounding the
+ * result once in round-to-nearest-even mode.
+ *
+ * \return Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - __fma_rn(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fma_rn(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fma_rn(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ * - __fma_rn(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                __fma_rn(double x, double y, double z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation in round-towards-zero mode.
+ *
+ * Computes the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation, rounding the
+ * result once in round-towards-zero mode.
+ *
+ * \return Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - __fma_rz(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fma_rz(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fma_rz(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ * - __fma_rz(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                __fma_rz(double x, double y, double z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation in round-up mode.
+ *
+ * Computes the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation, rounding the
+ * result once in round-up (to positive infinity) mode.
+ *
+ * \return Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - __fma_ru(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fma_ru(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fma_ru(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ * - __fma_ru(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                __fma_ru(double x, double y, double z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation in round-down mode.
+ *
+ * Computes the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation, rounding the
+ * result once in round-down (to negative infinity) mode.
+ *
+ * \return Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - __fma_rd(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fma_rd(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fma_rd(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ * - __fma_rd(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                __fma_rd(double x, double y, double z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Add two floating-point values in round-to-nearest-even mode.
+ *
+ * Adds two floating-point values \p x and \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x + \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dadd_rn(double x, double y);
+/**      
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Add two floating-point values in round-towards-zero mode.
+ *
+ * Adds two floating-point values \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x + \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dadd_rz(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Add two floating-point values in round-up mode.
+ * 
+ * Adds two floating-point values \p x and \p y in round-up (to positive infinity) mode.
+ *    
+ * \return Returns \p x + \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */ 
+extern __device__ __device_builtin__ double                __dadd_ru(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Add two floating-point values in round-down mode.
+ *
+ * Adds two floating-point values \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x + \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dadd_rd(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Subtract two floating-point values in round-to-nearest-even mode.
+ *
+ * Subtracts two floating-point values \p x and \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x - \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dsub_rn(double x, double y);
+/**      
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Subtract two floating-point values in round-towards-zero mode.
+ *
+ * Subtracts two floating-point values \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x - \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dsub_rz(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Subtract two floating-point values in round-up mode.
+ * 
+ * Subtracts two floating-point values \p x and \p y in round-up (to positive infinity) mode.
+ *    
+ * \return Returns \p x - \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */ 
+extern __device__ __device_builtin__ double                __dsub_ru(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Subtract two floating-point values in round-down mode.
+ *
+ * Subtracts two floating-point values \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x - \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dsub_rd(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Multiply two floating-point values in round-to-nearest-even mode.
+ *
+ * Multiplies two floating-point values \p x and \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x * \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dmul_rn(double x, double y);
+/**      
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Multiply two floating-point values in round-towards-zero mode.
+ *
+ * Multiplies two floating-point values \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x * \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dmul_rz(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Multiply two floating-point values in round-up mode.
+ * 
+ * Multiplies two floating-point values \p x and \p y in round-up (to positive infinity) mode.
+ *    
+ * \return Returns \p x * \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dmul_ru(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Multiply two floating-point values in round-down mode.
+ *
+ * Multiplies two floating-point values \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x * \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dmul_rd(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a float in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to a single-precision
+ * floating-point value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ float                 __double2float_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a float in round-towards-zero mode.
+ *
+ * Convert the double-precision floating-point value \p x to a single-precision
+ * floating-point value in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ float                 __double2float_rz(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a float in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to a single-precision
+ * floating-point value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ float                 __double2float_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a float in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to a single-precision
+ * floating-point value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ float                 __double2float_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed int in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed integer value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ int                   __double2int_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed int in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed integer value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ int                   __double2int_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed int in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed integer value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ int                   __double2int_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned int in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned integer value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned int          __double2uint_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned int in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned integer value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned int          __double2uint_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned int in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned integer value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned int          __double2uint_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed 64-bit int in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed 64-bit integer value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ long long int          __double2ll_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed 64-bit int in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed 64-bit integer value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ long long int          __double2ll_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed 64-bit int in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed 64-bit integer value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ long long int          __double2ll_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned 64-bit int in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned 64-bit integer value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned long long int __double2ull_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned 64-bit int in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned 64-bit integer value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned long long int __double2ull_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned 64-bit int in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned 64-bit integer value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned long long int __double2ull_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed int to a double.
+ *
+ * Convert the signed integer value \p x to a double-precision floating-point value.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __int2double_rn(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned int to a double.
+ *
+ * Convert the unsigned integer value \p x to a double-precision floating-point value.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __uint2double_rn(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit int to a double in round-to-nearest-even mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
+ * value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ll2double_rn(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit int to a double in round-towards-zero mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
+ * value in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ll2double_rz(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit int to a double in round-up mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
+ * value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ll2double_ru(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit int to a double in round-down mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
+ * value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ll2double_rd(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit int to a double in round-to-nearest-even mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
+ * value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ull2double_rn(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit int to a double in round-towards-zero mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
+ * value in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ull2double_rz(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit int to a double in round-up mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
+ * value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ull2double_ru(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit int to a double in round-down mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
+ * value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ull2double_rd(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret high 32 bits in a double as a signed integer.
+ *
+ * Reinterpret the high 32 bits in the double-precision floating-point value \p x
+ * as a signed integer.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ int                    __double2hiint(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret low 32 bits in a double as a signed integer.
+ *
+ * Reinterpret the low 32 bits in the double-precision floating-point value \p x
+ * as a signed integer.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ int                    __double2loint(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret high and low 32-bit integer values as a double.
+ *
+ * Reinterpret the integer value of \p hi as the high 32 bits of a 
+ * double-precision floating-point value and the integer value of \p lo
+ * as the low 32 bits of the same double-precision floating-point value.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ double                 __hiloint2double(int hi, int lo);
+}
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/*
+ * Prototypes for the rounding-mode-selecting wrappers. Each one takes an
+ * explicit enum cudaRoundMode argument. The bodies visible in
+ * device_double_functions.hpp (included at the end of this header unless
+ * __CUDACC_RTC__ is defined) forward to a suffixed intrinsic chosen by that
+ * argument.
+ *
+ * Default arguments, as declared here:
+ *   - dmul, dadd, dsub and the *2double conversions: cudaRoundNearest
+ *   - double2int, double2uint, double2ll, double2ull: cudaRoundZero
+ *   - fma: no default; the mode must always be passed. NOTE(review):
+ *     presumably so a three-argument call keeps resolving to the ordinary
+ *     fma(double, double, double) -- confirm.
+ */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double fma(double a, double b, double c, enum cudaRoundMode mode);
+
+/* NOTE(review): the matching #define of EXCLUDE_FROM_RTC is expected earlier in this header (outside this excerpt). */
+#undef EXCLUDE_FROM_RTC
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dmul(double a, double b, enum cudaRoundMode mode = cudaRoundNearest);
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dadd(double a, double b, enum cudaRoundMode mode = cudaRoundNearest);
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dsub(double a, double b, enum cudaRoundMode mode = cudaRoundNearest);
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ int double2int(double a, enum cudaRoundMode mode = cudaRoundZero);
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned int double2uint(double a, enum cudaRoundMode mode = cudaRoundZero);
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ long long int double2ll(double a, enum cudaRoundMode mode = cudaRoundZero);
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned long long int double2ull(double a, enum cudaRoundMode mode = cudaRoundZero);
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ll2double(long long int a, enum cudaRoundMode mode = cudaRoundNearest);
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ull2double(unsigned long long int a, enum cudaRoundMode mode = cudaRoundNearest);
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double int2double(int a, enum cudaRoundMode mode = cudaRoundNearest);
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double uint2double(unsigned int a, enum cudaRoundMode mode = cudaRoundNearest);
+
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double float2double(float a, enum cudaRoundMode mode = cudaRoundNearest);
+
+#undef __DEVICE_DOUBLE_FUNCTIONS_DECL__
+
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#if !defined(__CUDACC_RTC__)
+#include "device_double_functions.hpp"
+#endif /* !__CUDACC_RTC__ */
+
+#endif /* !__DEVICE_DOUBLE_FUNCTIONS_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H__
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_double_functions.hpp b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_double_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f63063689d65c4a1dffb9a823ddaf6a5b353cba3
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_double_functions.hpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*
+ * Direct-inclusion guard. When __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ is
+ * not already defined, a diagnostic is emitted (#pragma message on MSVC,
+ * #warning elsewhere). The macro is then defined for the rest of this file,
+ * and a marker macro records that the end of the file must #undef it again.
+ */
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/device_double_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/device_double_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__
+#endif
+
+#if !defined(__DEVICE_DOUBLE_FUNCTIONS_HPP__)
+#define __DEVICE_DOUBLE_FUNCTIONS_HPP__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/*
+ * Qualifier prefix applied to every wrapper defined in this file:
+ * plain __device__ when __CUDACC_RTC__ is defined, and
+ * static __inline__ __device__ otherwise. The macro is #undef'd again after
+ * the last wrapper.
+ */
+#if defined(__CUDACC_RTC__)
+#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ __device__
+#else
+#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/**
+ * \brief Compute fma(\p a, \p b, \p c) with a caller-selected rounding mode.
+ *
+ * Forwards to __fma_rz, __fma_ru or __fma_rd for cudaRoundZero,
+ * cudaRoundPosInf and cudaRoundMinInf respectively. Every other value of
+ * \p mode, including cudaRoundNearest, forwards to __fma_rn.
+ *
+ * \return Returns the result of the selected intrinsic.
+ */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double fma(double a, double b, double c, enum cudaRoundMode mode)
+{
+  switch (mode) {
+    case cudaRoundZero:   return __fma_rz(a, b, c);
+    case cudaRoundPosInf: return __fma_ru(a, b, c);
+    case cudaRoundMinInf: return __fma_rd(a, b, c);
+    default:              return __fma_rn(a, b, c);  /* cudaRoundNearest and anything else */
+  }
+}
+
+/**
+ * \brief Multiply \p a by \p b with a caller-selected rounding mode.
+ *
+ * Forwards to __dmul_rz, __dmul_ru or __dmul_rd for cudaRoundZero,
+ * cudaRoundPosInf and cudaRoundMinInf respectively. Every other value of
+ * \p mode, including cudaRoundNearest, forwards to __dmul_rn.
+ *
+ * \return Returns the result of the selected intrinsic.
+ */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dmul(double a, double b, enum cudaRoundMode mode)
+{
+  switch (mode) {
+    case cudaRoundZero:   return __dmul_rz(a, b);
+    case cudaRoundPosInf: return __dmul_ru(a, b);
+    case cudaRoundMinInf: return __dmul_rd(a, b);
+    default:              return __dmul_rn(a, b);  /* cudaRoundNearest and anything else */
+  }
+}
+
+/**
+ * \brief Add \p a and \p b with a caller-selected rounding mode.
+ *
+ * Forwards to __dadd_rz, __dadd_ru or __dadd_rd for cudaRoundZero,
+ * cudaRoundPosInf and cudaRoundMinInf respectively. Every other value of
+ * \p mode, including cudaRoundNearest, forwards to __dadd_rn.
+ *
+ * \return Returns the result of the selected intrinsic.
+ */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dadd(double a, double b, enum cudaRoundMode mode)
+{
+  switch (mode) {
+    case cudaRoundZero:   return __dadd_rz(a, b);
+    case cudaRoundPosInf: return __dadd_ru(a, b);
+    case cudaRoundMinInf: return __dadd_rd(a, b);
+    default:              return __dadd_rn(a, b);  /* cudaRoundNearest and anything else */
+  }
+}
+
+/**
+ * \brief Subtract \p b from \p a with a caller-selected rounding mode.
+ *
+ * Forwards to __dsub_rz, __dsub_ru or __dsub_rd for cudaRoundZero,
+ * cudaRoundPosInf and cudaRoundMinInf respectively. Every other value of
+ * \p mode, including cudaRoundNearest, forwards to __dsub_rn.
+ *
+ * \return Returns the result of the selected intrinsic.
+ */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dsub(double a, double b, enum cudaRoundMode mode)
+{
+  switch (mode) {
+    case cudaRoundZero:   return __dsub_rz(a, b);
+    case cudaRoundPosInf: return __dsub_ru(a, b);
+    case cudaRoundMinInf: return __dsub_rd(a, b);
+    default:              return __dsub_rn(a, b);  /* cudaRoundNearest and anything else */
+  }
+}
+
+/**
+ * \brief Convert a double to a signed int with a caller-selected rounding mode.
+ *
+ * Forwards to __double2int_rn, __double2int_ru or __double2int_rd for
+ * cudaRoundNearest, cudaRoundPosInf and cudaRoundMinInf respectively. Every
+ * other value of \p mode, including cudaRoundZero, forwards to
+ * __double2int_rz.
+ *
+ * \return Returns the converted value.
+ */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ int double2int(double a, enum cudaRoundMode mode)
+{
+  switch (mode) {
+    case cudaRoundNearest: return __double2int_rn(a);
+    case cudaRoundPosInf:  return __double2int_ru(a);
+    case cudaRoundMinInf:  return __double2int_rd(a);
+    default:               return __double2int_rz(a);  /* cudaRoundZero and anything else */
+  }
+}
+
+/**
+ * \brief Convert a double to an unsigned int with a caller-selected rounding mode.
+ *
+ * Forwards to __double2uint_rn, __double2uint_ru or __double2uint_rd for
+ * cudaRoundNearest, cudaRoundPosInf and cudaRoundMinInf respectively. Every
+ * other value of \p mode, including cudaRoundZero, forwards to
+ * __double2uint_rz.
+ *
+ * \return Returns the converted value.
+ */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned int double2uint(double a, enum cudaRoundMode mode)
+{
+  switch (mode) {
+    case cudaRoundNearest: return __double2uint_rn(a);
+    case cudaRoundPosInf:  return __double2uint_ru(a);
+    case cudaRoundMinInf:  return __double2uint_rd(a);
+    default:               return __double2uint_rz(a);  /* cudaRoundZero and anything else */
+  }
+}
+
+/**
+ * \brief Convert a double to a signed 64-bit int with a caller-selected rounding mode.
+ *
+ * Forwards to __double2ll_rn, __double2ll_ru or __double2ll_rd for
+ * cudaRoundNearest, cudaRoundPosInf and cudaRoundMinInf respectively. Every
+ * other value of \p mode, including cudaRoundZero, forwards to
+ * __double2ll_rz.
+ *
+ * \return Returns the converted value.
+ */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ long long int double2ll(double a, enum cudaRoundMode mode)
+{
+  switch (mode) {
+    case cudaRoundNearest: return __double2ll_rn(a);
+    case cudaRoundPosInf:  return __double2ll_ru(a);
+    case cudaRoundMinInf:  return __double2ll_rd(a);
+    default:               return __double2ll_rz(a);  /* cudaRoundZero and anything else */
+  }
+}
+
+/**
+ * \brief Convert a double to an unsigned 64-bit int with a caller-selected rounding mode.
+ *
+ * Forwards to __double2ull_rn, __double2ull_ru or __double2ull_rd for
+ * cudaRoundNearest, cudaRoundPosInf and cudaRoundMinInf respectively. Every
+ * other value of \p mode, including cudaRoundZero, forwards to
+ * __double2ull_rz.
+ *
+ * \return Returns the converted value.
+ */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned long long int double2ull(double a, enum cudaRoundMode mode)
+{
+  switch (mode) {
+    case cudaRoundNearest: return __double2ull_rn(a);
+    case cudaRoundPosInf:  return __double2ull_ru(a);
+    case cudaRoundMinInf:  return __double2ull_rd(a);
+    default:               return __double2ull_rz(a);  /* cudaRoundZero and anything else */
+  }
+}
+
+/**
+ * \brief Convert a signed 64-bit int to a double with a caller-selected rounding mode.
+ *
+ * Forwards to __ll2double_rz, __ll2double_ru or __ll2double_rd for
+ * cudaRoundZero, cudaRoundPosInf and cudaRoundMinInf respectively. Every
+ * other value of \p mode, including cudaRoundNearest, forwards to
+ * __ll2double_rn.
+ *
+ * \return Returns the converted value.
+ */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ll2double(long long int a, enum cudaRoundMode mode)
+{
+  switch (mode) {
+    case cudaRoundZero:   return __ll2double_rz(a);
+    case cudaRoundPosInf: return __ll2double_ru(a);
+    case cudaRoundMinInf: return __ll2double_rd(a);
+    default:              return __ll2double_rn(a);  /* cudaRoundNearest and anything else */
+  }
+}
+
+/**
+ * \brief Convert an unsigned 64-bit int to a double with a caller-selected rounding mode.
+ *
+ * Forwards to __ull2double_rz, __ull2double_ru or __ull2double_rd for
+ * cudaRoundZero, cudaRoundPosInf and cudaRoundMinInf respectively. Every
+ * other value of \p mode, including cudaRoundNearest, forwards to
+ * __ull2double_rn.
+ *
+ * \return Returns the converted value.
+ */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ull2double(unsigned long long int a, enum cudaRoundMode mode)
+{
+  switch (mode) {
+    case cudaRoundZero:   return __ull2double_rz(a);
+    case cudaRoundPosInf: return __ull2double_ru(a);
+    case cudaRoundMinInf: return __ull2double_rd(a);
+    default:              return __ull2double_rn(a);  /* cudaRoundNearest and anything else */
+  }
+}
+
+/**
+ * \brief Convert a signed int to a double.
+ *
+ * \p mode is accepted for signature parity with the other wrappers but is
+ * never consulted; the conversion is a plain cast.
+ * NOTE(review): presumably rounding cannot matter here because every 32-bit
+ * int is exactly representable as a double -- confirm.
+ *
+ * \return Returns \p a converted to double.
+ */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double int2double(int a, enum cudaRoundMode mode)
+{
+  return static_cast<double>(a);
+}
+
+/**
+ * \brief Convert an unsigned int to a double.
+ *
+ * \p mode is accepted for signature parity with the other wrappers but is
+ * never consulted; the conversion is a plain cast.
+ * NOTE(review): presumably rounding cannot matter here because every 32-bit
+ * unsigned int is exactly representable as a double -- confirm.
+ *
+ * \return Returns \p a converted to double.
+ */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double uint2double(unsigned int a, enum cudaRoundMode mode)
+{
+  return static_cast<double>(a);
+}
+
+/**
+ * \brief Convert a float to a double.
+ *
+ * \p mode is accepted for signature parity with the other wrappers but is
+ * never consulted; the conversion is a plain cast.
+ * NOTE(review): presumably rounding cannot matter here because widening a
+ * float to a double is exact -- confirm.
+ *
+ * \return Returns \p a converted to double.
+ */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double float2double(float a, enum cudaRoundMode mode)
+{
+  return static_cast<double>(a);
+}
+
+#undef __DEVICE_DOUBLE_FUNCTIONS_DECL__
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#endif /* !__DEVICE_DOUBLE_FUNCTIONS_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_functions.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..c905aea7c376cd4426e16f84a93876d9b212bf16
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_functions.h
@@ -0,0 +1,3696 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*
+ * Direct-inclusion guard. When __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ is
+ * not already defined, a diagnostic is emitted (#pragma message on MSVC,
+ * #warning elsewhere). The macro is then defined for the rest of this file,
+ * and a marker macro is set.
+ * NOTE(review): the marker is presumably tested at the end of this header to
+ * #undef the macro again, as the sibling crt headers do -- confirm.
+ */
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/device_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/device_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H__
+#endif
+
+#if !defined(__DEVICE_FUNCTIONS_H__)
+#define __DEVICE_FUNCTIONS_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*
+ * Qualifier prefixes for the declarations in this header.
+ *   __DEVICE_FUNCTIONS_DECL__             - identical in both branches.
+ *   __DEVICE_FUNCTIONS_STATIC_DECL__      - gains "static __inline__" when
+ *                                           __CUDACC_RTC__ is not defined.
+ *   __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ - same, plus __host__.
+ */
+#if defined(__CUDACC_RTC__)
+#define __DEVICE_FUNCTIONS_DECL__ __device__ __cudart_builtin__
+#define __DEVICE_FUNCTIONS_STATIC_DECL__ __device__ __cudart_builtin__
+#define __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ __device__ __host__ __cudart_builtin__
+#else
+#define __DEVICE_FUNCTIONS_DECL__ __device__ __cudart_builtin__
+#define __DEVICE_FUNCTIONS_STATIC_DECL__ static __inline__ __device__ __cudart_builtin__
+#define __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ static __inline__ __device__ __host__ __cudart_builtin__
+#endif /* __CUDACC_RTC__ */
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+extern "C"
+{
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the most significant 32 bits of the product of the two 32-bit integers.
+ *
+ * Calculate the most significant 32 bits of the 64-bit product \p x * \p y, where \p x and \p y
+ * are 32-bit integers.
+ *
+ * \return Returns the most significant 32 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __mulhi(int x, int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the most significant 32 bits of the product of the two 32-bit unsigned integers.
+ *
+ * Calculate the most significant 32 bits of the 64-bit product \p x * \p y, where \p x and \p y
+ * are 32-bit unsigned integers. 
+ *
+ * \return Returns the most significant 32 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __umulhi(unsigned int x, unsigned int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the most significant 64 bits of the product of the two 64-bit integers.
+ *
+ * Calculate the most significant 64 bits of the 128-bit product \p x * \p y, where \p x and \p y
+ * are 64-bit integers. 
+ *
+ * \return Returns the most significant 64 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __mul64hi(long long int x, long long int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the most significant 64 bits of the product of the two 64-bit unsigned integers.
+ *
+ * Calculate the most significant 64 bits of the 128-bit product \p x * \p y, where \p x and \p y
+ * are 64-bit unsigned integers. 
+ *
+ * \return Returns the most significant 64 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in an integer as a float.
+ *
+ * Reinterpret the bits in the signed integer value \p x as a single-precision
+ * floating-point value.
+ * \return Returns reinterpreted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __int_as_float(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in a float as a signed integer.
+ *
+ * Reinterpret the bits in the single-precision floating-point value \p x
+ * as a signed integer.
+ * \return Returns reinterpreted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __float_as_int(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in an unsigned integer as a float.
+ *
+ * Reinterpret the bits in the unsigned integer value \p x as a single-precision
+ * floating-point value.
+ * \return Returns reinterpreted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __uint_as_float(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in a float as an unsigned integer.
+ *
+ * Reinterpret the bits in the single-precision floating-point value \p x
+ * as an unsigned integer.
+ * \return Returns reinterpreted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __float_as_uint(float x);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   __syncthreads(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   __prof_trigger(int);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   __threadfence(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   __threadfence_block(void);
+__DEVICE_FUNCTIONS_DECL__ 
+#if defined(__GNUC__) || defined(__CUDACC_RTC__)
+__attribute__((__noreturn__))
+#elif defined(_MSC_VER)
+__declspec(noreturn)
+#endif  /* defined(__GNUC__) || defined(__CUDACC_RTC__) */
+__device_builtin__ void                   __trap(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   __brkpt();
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Clamp the input argument to [+0.0, 1.0].
+ *
+ * Clamp the input argument \p x to be within the interval [+0.0, 1.0].
+ * \return 
+ * - __saturatef(\p x) returns 0 if \p x < 0.
+ * - __saturatef(\p x) returns 1 if \p x > 1.
+ * - __saturatef(\p x) returns \p x if 
+ * \latexonly $0 \le x \le 1$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mn>0</m:mn>
+ *   <m:mo>&#x2264;<!-- &Le --></m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x2264;<!-- &Le --></m:mo>
+ *   <m:mn>1</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - __saturatef(NaN) returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __saturatef(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate 
+ * \latexonly $|x - y| + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , the sum of absolute difference.
+ *
+ * Calculate 
+ * \latexonly $|x - y| + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , the 32-bit sum of the third argument \p z and the absolute
+ * value of the difference between the first argument, \p x, and second
+ * argument, \p y.
+ * 
+ * Inputs \p x and \p y are signed 32-bit integers, input \p z is 
+ * a 32-bit unsigned integer.
+ *
+ * \return Returns 
+ * \latexonly $|x - y| + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __sad(int x, int y, unsigned int z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate 
+ * \latexonly $|x - y| + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , the sum of absolute difference.
+ *
+ * Calculate 
+ * \latexonly $|x - y| + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , the 32-bit sum of the third argument \p z and the absolute
+ * value of the difference between the first argument, \p x, and second
+ * argument, \p y.
+ * 
+ * Inputs \p x, \p y, and \p z are unsigned 32-bit integers.
+ * 
+ * \return Returns 
+ * \latexonly $|x - y| + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __usad(unsigned int x, unsigned int y, unsigned int z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the least significant 32 bits of the product of the least significant 24 bits of two integers.
+ *
+ * Calculate the least significant 32 bits of the product of the least significant 24 bits of \p x and \p y.
+ * The high order 8 bits of \p x and \p y are ignored.
+ *
+ * \return Returns the least significant 32 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __mul24(int x, int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the least significant 32 bits of the product of the least significant 24 bits of two unsigned integers.
+ *
+ * Calculate the least significant 32 bits of the product of the least significant 24 bits of \p x and \p y.
+ * The high order 8 bits of  \p x and  \p y are ignored. 
+ *
+ * \return Returns the least significant 32 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __umul24(unsigned int x, unsigned int y);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Divide two floating-point values.
+ *
+ * Compute \p x divided by \p y.  If <tt>--use_fast_math</tt> is specified,
+ * use ::__fdividef() for higher performance, otherwise use normal division.
+ *
+ * \return Returns \p x / \p y.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fdividef(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate division of the input arguments.
+ *
+ * Calculate the fast approximate division of \p x by \p y.
+ *
+ * \return Returns \p x / \p y.
+ * - __fdividef(
+ * \latexonly $\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns NaN for 
+ * \latexonly $2^{126} < |y| < 2^{128}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mn>126</m:mn>
+ *     </m:mrow>
+ *   </m:msup>
+ *   <m:mo>&lt;</m:mo>
+ *   <m:mi>|y|</m:mi>
+ *   <m:mo>&lt;</m:mo>
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mn>128</m:mn>
+ *     </m:mrow>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - __fdividef(\p x, \p y) returns 0 for 
+ * \latexonly $2^{126} < |y| < 2^{128}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mn>126</m:mn>
+ *     </m:mrow>
+ *   </m:msup>
+ *   <m:mo>&lt;</m:mo>
+ *   <m:mi>|y|</m:mi>
+ *   <m:mo>&lt;</m:mo>
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mn>128</m:mn>
+ *     </m:mrow>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  and finite
+ * \latexonly $x$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fdividef(float x, float y);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 fdivide(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate sine of the input argument.
+ *
+ * Calculate the fast approximate sine of the input argument \p x, measured in radians.
+ *
+ * \return Returns the approximate sine of \p x.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note Output in the denormal range is flushed to sign preserving 0.0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __sinf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate cosine of the input argument.
+ *
+ * Calculate the fast approximate cosine of the input argument \p x, measured in radians.
+ *
+ * \return Returns the approximate cosine of \p x.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __cosf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate tangent of the input argument.
+ *
+ * Calculate the fast approximate tangent of the input argument \p x, measured in radians.
+ *
+ * \return Returns the approximate tangent of \p x.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note The result is computed as the fast divide of ::__sinf()
+ * by ::__cosf(). Denormal output is flushed to sign-preserving 0.0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __tanf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate sine and cosine of the first input argument.
+ *
+ * Calculate the fast approximate sine and cosine of the first input argument \p x (measured
+ * in radians). The results for sine and cosine are written into the second
+ * argument, \p sptr, and the third argument, \p cptr, respectively.
+ *
+ * \return
+ * - none
+ *
+ * \note_accuracy_single_intrinsic
+ * \note Denorm input/output is flushed to sign preserving 0.0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ void                   __sincosf(float x, float *sptr, float *cptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate base 
+ * \latexonly $e$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>e</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  exponential of the input argument.
+ *
+ * Calculate the fast approximate base 
+ * \latexonly $e$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>e</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  exponential of the input argument \p x, 
+ * \latexonly $e^x$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>e</m:mi>
+ *     <m:mi>x</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return Returns an approximation to 
+ * \latexonly $e^x$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>e</m:mi>
+ *     <m:mi>x</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __expf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate base 10 exponential of the input argument.
+ *
+ * Calculate the fast approximate base 10 exponential of the input argument \p x, 
+ * \latexonly $10^x$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mn>10</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return Returns an approximation to 
+ * \latexonly $10^x$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mn>10</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __exp10f(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate base 2 logarithm of the input argument.
+ *
+ * Calculate the fast approximate base 2 logarithm of the input argument \p x.
+ *
+ * \return Returns an approximation to 
+ * \latexonly $\log_2(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>log</m:mi>
+ *     <m:mn>2</m:mn>
+ *   </m:msub>
+ *   <m:mo>&#x2061;<!-- &functionAplication --></m:mo>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __log2f(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate base 10 logarithm of the input argument.
+ *
+ * Calculate the fast approximate base 10 logarithm of the input argument \p x.
+ *
+ * \return Returns an approximation to 
+ * \latexonly $\log_{10}(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>log</m:mi>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mn>10</m:mn>
+ *     </m:mrow>
+ *   </m:msub>
+ *   <m:mo>&#x2061;<!-- &functionAplication --></m:mo>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __log10f(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate base 
+ * \latexonly $e$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>e</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  logarithm of the input argument.
+ *
+ * Calculate the fast approximate base 
+ * \latexonly $e$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>e</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  logarithm of the input argument \p x.
+ *
+ * \return Returns an approximation to 
+ * \latexonly $\log_e(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>log</m:mi>
+ *     <m:mi>e</m:mi>
+ *   </m:msub>
+ *   <m:mo>&#x2061;<!-- &functionAplication --></m:mo>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __logf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximation of
+ * \latexonly $x^y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>x</m:mi>
+ *     <m:mi>y</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * Calculate the fast approximation of \p x, the first input argument,
+ * raised to the power of \p y, the second input argument, 
+ * \latexonly $x^y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>x</m:mi>
+ *     <m:mi>y</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return Returns an approximation to 
+ * \latexonly $x^y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>x</m:mi>
+ *     <m:mi>y</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __powf(float x, float y) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed integer in round-to-nearest-even mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed integer
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __float2int_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed integer in round-towards-zero mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed integer
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __float2int_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed integer in round-up mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed integer
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __float2int_ru(float);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed integer in round-down mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed integer
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __float2int_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned integer in round-to-nearest-even mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned integer
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __float2uint_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned integer in round-towards-zero mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned integer
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __float2uint_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned integer in round-up mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned integer
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __float2uint_ru(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned integer in round-down mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned integer
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __float2uint_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed integer to a float in round-to-nearest-even mode.
+ *
+ * Convert the signed integer value \p x to a single-precision floating-point value
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __int2float_rn(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed integer to a float in round-towards-zero mode.
+ *
+ * Convert the signed integer value \p x to a single-precision floating-point value
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __int2float_rz(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed integer to a float in round-up mode.
+ *
+ * Convert the signed integer value \p x to a single-precision floating-point value
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __int2float_ru(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed integer to a float in round-down mode.
+ *
+ * Convert the signed integer value \p x to a single-precision floating-point value
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __int2float_rd(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned integer to a float in round-to-nearest-even mode.
+ *
+ * Convert the unsigned integer value \p x to a single-precision floating-point value
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __uint2float_rn(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned integer to a float in round-towards-zero mode.
+ *
+ * Convert the unsigned integer value \p x to a single-precision floating-point value
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __uint2float_rz(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned integer to a float in round-up mode.
+ *
+ * Convert the unsigned integer value \p x to a single-precision floating-point value
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __uint2float_ru(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned integer to a float in round-down mode.
+ *
+ * Convert the unsigned integer value \p x to a single-precision floating-point value
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __uint2float_rd(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed 64-bit integer in round-to-nearest-even mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed 64-bit integer
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __float2ll_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed 64-bit integer in round-towards-zero mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed 64-bit integer
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __float2ll_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed 64-bit integer in round-up mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed 64-bit integer
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __float2ll_ru(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed 64-bit integer in round-down mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed 64-bit integer
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __float2ll_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned 64-bit integer in round-to-nearest-even mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned 64-bit integer
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __float2ull_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned 64-bit integer in round-towards-zero mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned 64-bit integer
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __float2ull_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned 64-bit integer in round-up mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned 64-bit integer
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __float2ull_ru(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned 64-bit integer in round-down mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned 64-bit integer
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __float2ull_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit integer to a float in round-to-nearest-even mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a single-precision floating-point value
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ll2float_rn(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit integer to a float in round-towards-zero mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a single-precision floating-point value
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ll2float_rz(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit integer to a float in round-up mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a single-precision floating-point value
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ll2float_ru(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit integer to a float in round-down mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a single-precision floating-point value
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ll2float_rd(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit integer to a float in round-to-nearest-even mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a single-precision floating-point value
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ull2float_rn(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit integer to a float in round-towards-zero mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a single-precision floating-point value
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ull2float_rz(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit integer to a float in round-up mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a single-precision floating-point value
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ull2float_ru(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit integer to a float in round-down mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a single-precision floating-point value
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ull2float_rd(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Add two floating-point values in round-to-nearest-even mode.
+ * 
+ * Compute the sum of \p x and \p y in round-to-nearest-even rounding mode.
+ *
+ * \return Returns \p x + \p y.
+ *
+ * \note_accuracy_single
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fadd_rn(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Add two floating-point values in round-towards-zero mode.
+ * 
+ * Compute the sum of \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x + \p y.
+ *
+ * \note_accuracy_single
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fadd_rz(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Add two floating-point values in round-up mode.
+ * 
+ * Compute the sum of \p x and \p y in round-up (to positive infinity) mode.
+ *
+ * \return Returns \p x + \p y.
+ *
+ * \note_accuracy_single
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fadd_ru(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Add two floating-point values in round-down mode.
+ * 
+ * Compute the sum of \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x + \p y.
+ *
+ * \note_accuracy_single
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fadd_rd(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Subtract two floating-point values in round-to-nearest-even mode.
+ * 
+ * Compute the difference of \p x and \p y in round-to-nearest-even rounding mode.
+ *
+ * \return Returns \p x - \p y.
+ *
+ * \note_accuracy_single
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsub_rn(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Subtract two floating-point values in round-towards-zero mode.
+ * 
+ * Compute the difference of \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x - \p y.
+ *
+ * \note_accuracy_single
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsub_rz(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Subtract two floating-point values in round-up mode.
+ * 
+ * Compute the difference of \p x and \p y in round-up (to positive infinity) mode.
+ *
+ * \return Returns \p x - \p y.
+ *
+ * \note_accuracy_single
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsub_ru(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Subtract two floating-point values in round-down mode.
+ * 
+ * Compute the difference of \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x - \p y.
+ *
+ * \note_accuracy_single
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsub_rd(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Multiply two floating-point values in round-to-nearest-even mode.
+ * 
+ * Compute the product of \p x and \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x * \p y.
+ *
+ * \note_accuracy_single
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmul_rn(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Multiply two floating-point values in round-towards-zero mode.
+ * 
+ * Compute the product of \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x * \p y.
+ *
+ * \note_accuracy_single
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmul_rz(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Multiply two floating-point values in round-up mode.
+ * 
+ * Compute the product of \p x and \p y in round-up (to positive infinity) mode.
+ *
+ * \return Returns \p x * \p y.
+ *
+ * \note_accuracy_single
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmul_ru(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Multiply two floating-point values in round-down mode.
+ * 
+ * Compute the product of \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x * \p y.
+ *
+ * \note_accuracy_single
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmul_rd(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation, in round-to-nearest-even mode.
+ * 
+ * Computes the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation, rounding the
+ * result once in round-to-nearest-even mode.
+ *
+ * \return Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - __fmaf_rn(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fmaf_rn(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fmaf_rn(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if 
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - __fmaf_rn(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if 
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmaf_rn(float x, float y, float z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation, in round-towards-zero mode.
+ * 
+ * Computes the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation, rounding the
+ * result once in round-towards-zero mode.
+ *
+ * \return Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - __fmaf_rz(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fmaf_rz(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fmaf_rz(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if 
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - __fmaf_rz(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if 
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmaf_rz(float x, float y, float z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation, in round-up mode.
+ * 
+ * Computes the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation, rounding the
+ * result once in round-up (to positive infinity) mode.
+ *
+ * \return Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - __fmaf_ru(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fmaf_ru(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fmaf_ru(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if 
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - __fmaf_ru(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if 
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmaf_ru(float x, float y, float z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation, in round-down mode.
+ * 
+ * Computes the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation, rounding the
+ * result once in round-down (to negative infinity) mode.
+ *
+ * \return Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - __fmaf_rd(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fmaf_rd(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - __fmaf_rd(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if 
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - __fmaf_rd(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if 
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmaf_rd(float x, float y, float z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \latexonly $\frac{1}{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  in round-to-nearest-even mode.
+ * 
+ * Compute the reciprocal of \p x in round-to-nearest-even mode.
+ *
+ * \return Returns 
+ * \latexonly $\frac{1}{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __frcp_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \latexonly $\frac{1}{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  in round-towards-zero mode.
+ * 
+ * Compute the reciprocal of \p x in round-towards-zero mode.
+ *
+ * \return Returns 
+ * \latexonly $\frac{1}{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __frcp_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \latexonly $\frac{1}{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  in round-up mode.
+ * 
+ * Compute the reciprocal of \p x in round-up (to positive infinity) mode.
+ *
+ * \return Returns 
+ * \latexonly $\frac{1}{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __frcp_ru(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \latexonly $\frac{1}{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  in round-down mode.
+ * 
+ * Compute the reciprocal of \p x in round-down (to negative infinity) mode.
+ *
+ * \return Returns 
+ * \latexonly $\frac{1}{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __frcp_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  in round-to-nearest-even mode.
+ * 
+ * Compute the square root of \p x in round-to-nearest-even mode.
+ *
+ * \return Returns 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsqrt_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  in round-towards-zero mode.
+ * 
+ * Compute the square root of \p x in round-towards-zero mode.
+ *
+ * \return Returns 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsqrt_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  in round-up mode.
+ * 
+ * Compute the square root of \p x in round-up (to positive infinity) mode.
+ *
+ * \return Returns 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsqrt_ru(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  in round-down mode.
+ * 
+ * Compute the square root of \p x in round-down (to negative infinity) mode.
+ *
+ * \return Returns 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsqrt_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute
+ * \latexonly $1/\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mn>1</m:mn>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo>/</m:mo>
+ *   </m:mrow>
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  in round-to-nearest-even mode.
+ * 
+ * Compute the reciprocal square root of \p x in round-to-nearest-even mode.
+ *
+ * \return Returns
+ * \latexonly $1/\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mn>1</m:mn>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo>/</m:mo>
+ *   </m:mrow>
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __frsqrt_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Divide two floating-point values in round-to-nearest-even mode.
+ *
+ * Divide the floating-point value \p x by \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x / \p y.
+ *
+ * \note_accuracy_single
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fdiv_rn(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Divide two floating-point values in round-towards-zero mode.
+ *
+ * Divide the floating-point value \p x by \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x / \p y.
+ *
+ * \note_accuracy_single
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fdiv_rz(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Divide two floating-point values in round-up mode.
+ * 
+ * Divide the floating-point value \p x by \p y in round-up (to positive infinity) mode.
+ *
+ * \return Returns \p x / \p y.
+ *
+ * \note_accuracy_single
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fdiv_ru(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Divide two floating-point values in round-down mode.
+ *
+ * Divide the floating-point value \p x by \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x / \p y.
+ *
+ * \note_accuracy_single
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fdiv_rd(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Return the number of consecutive high-order zero bits in a 32-bit integer.
+ *
+ * Count the number of consecutive leading zero bits, starting at the most significant bit (bit 31) of \p x.
+ *
+ * \return Returns a value between 0 and 32 inclusive representing the number of leading zero bits.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __clz(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Find the position of the least significant bit set to 1 in a 32-bit integer.
+ *
+ * Find the position of the first (least significant) bit set to 1 in \p x, where the least significant
+ * bit position is 1. 
+ *
+ * \return Returns a value between 0 and 32 inclusive representing the position of the first bit set.
+ * - __ffs(0) returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __ffs(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Count the number of bits that are set to 1 in a 32-bit integer.
+ *
+ * Count the number of bits that are set to 1 in \p x.
+ *
+ * \return Returns a value between 0 and 32 inclusive representing the number of set bits.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __popc(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Reverse the bit order of a 32-bit unsigned integer.
+ *
+ * Reverses the bit order of the 32-bit unsigned integer \p x.
+ *
+ * \return Returns the bit-reversed value of \p x, i.e. bit N of the return value corresponds to bit 31-N of \p x.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __brev(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Count the number of consecutive high-order zero bits in a 64-bit integer.
+ *
+ * Count the number of consecutive leading zero bits, starting at the most significant bit (bit 63) of \p x.
+ *
+ * \return Returns a value between 0 and 64 inclusive representing the number of leading zero bits.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __clzll(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Find the position of the least significant bit set to 1 in a 64-bit integer.
+ *
+ * Find the position of the first (least significant) bit set to 1 in \p x, where the least significant
+ * bit position is 1. 
+ *
+ * \return Returns a value between 0 and 64 inclusive representing the position of the first bit set.
+ * - __ffsll(0) returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __ffsll(long long int x);
+
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Count the number of bits that are set to 1 in a 64-bit integer.
+ *
+ * Count the number of bits that are set to 1 in \p x.
+ *
+ * \return Returns a value between 0 and 64 inclusive representing the number of set bits.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __popcll(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Reverse the bit order of a 64-bit unsigned integer.
+ *
+ * Reverses the bit order of the 64-bit unsigned integer \p x.
+ *
+ * \return Returns the bit-reversed value of \p x. i.e. bit N of the return value corresponds to bit 63-N of \p x.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __brevll(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Return selected bytes from two 32-bit unsigned integers.
+ *
+ * \return Returns a 32-bit integer consisting of four bytes from eight input bytes provided in the two
+ * input integers \p x and \p y, as specified by a selector, \p s.
+ *
+ * Create 8-byte source
+ * - uint64_t \p tmp64 = ((uint64_t)\p y << 32) | \p x;
+ *
+ * Extract selector bits
+ * - \p selector0 = (\p s >>  0) & 0x7;
+ * - \p selector1 = (\p s >>  4) & 0x7;
+ * - \p selector2 = (\p s >>  8) & 0x7;
+ * - \p selector3 = (\p s >> 12) & 0x7;
+ *
+ * Return 4 selected bytes from 8-byte source:
+ * - \p res[07:00] = \p tmp64[\p selector0];
+ * - \p res[15:08] = \p tmp64[\p selector1];
+ * - \p res[23:16] = \p tmp64[\p selector2];
+ * - \p res[31:24] = \p tmp64[\p selector3];
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __byte_perm(unsigned int x, unsigned int y, unsigned int s);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Compute average of signed input arguments, avoiding overflow
+ * in the intermediate sum.
+ *
+ * Compute average of signed input arguments \p x and \p y 
+ * as ( \p x + \p y ) >> 1, avoiding overflow in the intermediate sum.
+ *
+ * \return Returns a signed integer value representing the signed 
+ * average value of the two inputs.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __hadd(int x, int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Compute rounded average of signed input arguments, avoiding
+ * overflow in the intermediate sum.
+ *
+ * Compute average of signed input arguments \p x and \p y 
+ * as ( \p x + \p y + 1 ) >> 1, avoiding overflow in the intermediate
+ * sum.
+ *
+ * \return Returns a signed integer value representing the signed 
+ * rounded average value of the two inputs.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __rhadd(int x, int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Compute average of unsigned input arguments, avoiding overflow
+ * in the intermediate sum.
+ *
+ * Compute average of unsigned input arguments \p x and \p y 
+ * as ( \p x + \p y ) >> 1, avoiding overflow in the intermediate sum.
+ *
+ * \return Returns an unsigned integer value representing the unsigned 
+ * average value of the two inputs.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __uhadd(unsigned int x, unsigned int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Compute rounded average of unsigned input arguments, avoiding
+ * overflow in the intermediate sum.
+ *
+ * Compute average of unsigned input arguments \p x and \p y 
+ * as ( \p x + \p y + 1 ) >> 1, avoiding overflow in the intermediate
+ * sum.
+ *
+ * \return Returns an unsigned integer value representing the unsigned 
+ * rounded average value of the two inputs.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __urhadd(unsigned int x, unsigned int y);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed int in round-towards-zero mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed integer value in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __double2int_rz(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned int in round-towards-zero mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned integer value in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __double2uint_rz(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed 64-bit int in round-towards-zero mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed 64-bit integer value in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __double2ll_rz(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned 64-bit int in round-towards-zero mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned 64-bit integer value in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __double2ull_rz(double x);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __pm0(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __pm1(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __pm2(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __pm3(void);
+
+/*******************************************************************************
+ *                                                                             *
+ *                        FP16 SIMD functions                                  *
+ *                                                                             *
+ *******************************************************************************/
+
+ //  #include "fp16.h"
+
+
+/*******************************************************************************
+ *                                                                             *
+ *                                SIMD functions                               *
+ *                                                                             *
+ *******************************************************************************/
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword absolute value.
+ *
+ * Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes,
+ * then computes absolute value for each of parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabs2(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword absolute value with signed saturation.
+ *
+ * Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes,
+ * then computes absolute value with signed saturation for each of parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsss2(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed addition, with wrap-around: a + b
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes,
+ * then performs unsigned addition on corresponding parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vadd2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword addition with signed saturation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes,
+ * then performs addition with signed saturation on corresponding parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vaddss2 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword addition with unsigned saturation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes,
+ * then performs addition with unsigned saturation on corresponding parts.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vaddus2 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed rounded average computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes,
+ * then computes signed rounded average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vavgs2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned rounded average computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes,
+ * then computes unsigned rounded average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vavgu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned average computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes,
+ * then computes unsigned average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vhaddu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed comparison.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if they are equal, and 0000 otherwise.
+ * For example __vcmpeq2(0x1234aba5, 0x1234aba6) returns 0xffff0000.
+ * \return Returns 0xffff if a = b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpeq2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: a >= b ? 0xffff : 0.
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part >= 'b' part, and 0000 otherwise.
+ * For example __vcmpges2(0x1234aba5, 0x1234aba6) returns 0xffff0000.
+ * \return Returns 0xffff if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpges2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: a >= b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part >= 'b' part, and 0000 otherwise.
+ * For example __vcmpgeu2(0x1234aba5, 0x1234aba6) returns 0xffff0000.
+ * \return Returns 0xffff if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgeu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: a > b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part > 'b' part, and 0000 otherwise.
+ * For example __vcmpgts2(0x1234aba5, 0x1234aba6) returns 0x00000000.
+ * \return Returns 0xffff if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgts2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: a > b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part > 'b' part, and 0000 otherwise.
+ * For example __vcmpgtu2(0x1234aba5, 0x1234aba6) returns 0x00000000.
+ * \return Returns 0xffff if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgtu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: a <= b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part <= 'b' part, and 0000 otherwise.
+ * For example __vcmples2(0x1234aba5, 0x1234aba6) returns 0xffffffff.
+ * \return Returns 0xffff if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmples2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: a <= b ? 0xffff : 0.
+ *
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part <= 'b' part, and 0000 otherwise.
+ * For example __vcmpleu2(0x1234aba5, 0x1234aba6) returns 0xffffffff.
+ * \return Returns 0xffff if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpleu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: a < b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part < 'b' part, and 0000 otherwise.
+ * For example __vcmplts2(0x1234aba5, 0x1234aba6) returns 0x0000ffff.
+ * \return Returns 0xffff if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmplts2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: a < b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part < 'b' part, and 0000 otherwise.
+ * For example __vcmpltu2(0x1234aba5, 0x1234aba6) returns 0x0000ffff.
+ * \return Returns 0xffff if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpltu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed comparison: a != b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part != 'b' part, and 0000 otherwise.
+ * For example __vcmpne2(0x1234aba5, 0x1234aba6) returns 0x0000ffff.
+ * \return Returns 0xffff if a != b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpne2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword absolute difference of unsigned integer computation: |a - b|
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes absolute difference. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsdiffu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed maximum computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes signed maximum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmaxs2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned maximum computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes unsigned maximum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmaxu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed minimum computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes signed minimum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmins2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned minimum computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes unsigned minimum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vminu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed comparison.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part == 'b' part.
+ * If both equalities are satisfied, function returns 1.
+ * \return Returns 1 if a = b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vseteq2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part >= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetges2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part >= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgeu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part > 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgts2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part > 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgtu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part <= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetles2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part <= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetleu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part < 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetlts2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part < 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetltu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed comparison.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part != 'b' part.
+ * If both conditions are satisfied, function returns 1.
+ * \return Returns 1 if a != b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetne2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword sum of abs diff of unsigned.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes absolute differences and returns
+ * sum of those differences.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsadu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed subtraction, with wrap-around.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs subtraction. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsub2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed subtraction, with signed saturation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs subtraction with signed saturation.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsubss2 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword subtraction with unsigned saturation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs subtraction with unsigned saturation.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsubus2 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword negation.
+ *
+ * Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes.
+ * For each part function computes negation. Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vneg2(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword negation with signed saturation.
+ *
+ * Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes.
+ * For each part function computes negation with signed saturation. Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vnegss2(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword absolute difference of signed integers.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes absolute difference.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsdiffs2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword sum of absolute differences of signed integers.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes absolute differences and sums them up.
+ * The sum is returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsads2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte absolute value.
+ *
+ * Splits argument by bytes. Computes absolute value of each byte.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabs4(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte absolute value with signed saturation.
+ *
+ * Splits 4 bytes of argument into 4 parts, each consisting of 1 byte,
+ * then computes absolute value with signed saturation for each of parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsss4(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte (un)signed addition.
+ *
+ * Splits 'a' into 4 bytes, then performs unsigned addition on each of these
+ * bytes with the corresponding byte from 'b', ignoring overflow.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vadd4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte addition with signed saturation.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte,
+ * then performs addition with signed saturation on corresponding parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vaddss4 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte addition with unsigned saturation.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte,
+ * then performs addition with unsigned saturation on corresponding parts.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vaddus4 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte signed rounded average.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte,
+ * then computes signed rounded average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vavgs4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned rounded average.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte,
+ * then computes unsigned rounded average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vavgu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte unsigned average.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte,
+ * then computes unsigned average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vhaddu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte (un)signed comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if they are equal, and 00 otherwise.
+ * For example __vcmpeq4(0x1234aba5, 0x1234aba6) returns 0xffffff00.
+ * \return Returns 0xff if a = b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpeq4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part >= 'b' part, and 00 otherwise.
+ * For example __vcmpges4(0x1234aba5, 0x1234aba6) returns 0xffffff00.
+ * \return Returns 0xff if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpges4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part >= 'b' part, and 00 otherwise.
+ * For example __vcmpgeu4(0x1234aba5, 0x1234aba6) returns 0xffffff00.
+ * \return Returns 0xff if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgeu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part > 'b' part, and 00 otherwise.
+ * For example __vcmpgts4(0x1234aba5, 0x1234aba6) returns 0x00000000.
+ * \return Returns 0xff if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgts4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part > 'b' part, and 00 otherwise.
+ * For example __vcmpgtu4(0x1234aba5, 0x1234aba6) returns 0x00000000.
+ * \return Returns 0xff if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgtu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part <= 'b' part, and 00 otherwise.
+ * For example __vcmples4(0x1234aba5, 0x1234aba6) returns 0xffffffff.
+ * \return Returns 0xff if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmples4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part <= 'b' part, and 00 otherwise.
+ * For example __vcmpleu4(0x1234aba5, 0x1234aba6) returns 0xffffffff.
+ * \return Returns 0xff if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpleu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part < 'b' part, and 00 otherwise.
+ * For example __vcmplts4(0x1234aba5, 0x1234aba6) returns 0x000000ff.
+ * \return Returns 0xff if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmplts4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part < 'b' part, and 00 otherwise.
+ * For example __vcmpltu4(0x1234aba5, 0x1234aba6) returns 0x000000ff.
+ * \return Returns 0xff if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpltu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte (un)signed comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part != 'b' part, and 00 otherwise.
+ * For example __vcmpne4(0x1234aba5, 0x1234aba6) returns 0x000000ff.
+ * \return Returns 0xff if a != b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpne4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte absolute difference of unsigned integer.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes absolute difference. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsdiffu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte signed maximum.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes signed maximum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmaxs4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte unsigned maximum.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes unsigned maximum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmaxu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte signed minimum.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes signed minimum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmins4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte unsigned minimum.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes unsigned minimum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vminu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte (un)signed comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part == 'b' part.
+ * If both equalities are satisfied, function returns 1.
+ * \return Returns 1 if a = b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vseteq4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part <= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetles4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part <= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetleu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part < 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetlts4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part < 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetltu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part >= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetges4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part >= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgeu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part > 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgts4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part > 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgtu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte (un)signed comparison.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part != 'b' part.
+ * If both conditions are satisfied, function returns 1.
+ * \return Returns 1 if a != b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetne4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte sum of abs difference of unsigned.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes absolute differences and returns
+ * sum of those differences.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsadu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte subtraction.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs subtraction. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsub4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte subtraction with signed saturation.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs subtraction with signed saturation.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsubss4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte subtraction with unsigned saturation.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs subtraction with unsigned saturation.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsubus4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte negation.
+ *
+ * Splits 4 bytes of argument into 4 parts, each consisting of 1 byte.
+ * For each part function computes negation. Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vneg4(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte negation with signed saturation.
+ *
+ * Splits 4 bytes of argument into 4 parts, each consisting of 1 byte.
+ * For each part function computes negation with signed saturation. Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vnegss4(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte absolute difference of signed integer.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes absolute difference.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsdiffs4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte sum of abs difference of signed.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes absolute difference and sums it up.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsads4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(max(a, b), 0)
+ *
+ * Calculates the maximum of \p a and \p b of two signed ints, if this is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax_s32_relu(const int a, const int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(max(a, b), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a max with relu ( = max(a_part, b_part, 0) ). Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimax_s16x2_relu(const unsigned int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(min(a, b), 0)
+ *
+ * Calculates the minimum of \p a and \p b of two signed ints, if this is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vimin_s32_relu(const int a, const int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(min(a, b), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a min with relu ( = max(min(a_part, b_part), 0) ). Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimin_s16x2_relu(const unsigned int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(max(a, b), c)
+ * 
+ * Calculates the 3-way max of signed integers \p a, \p b and \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vimax3_s32(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(max(a, b), c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a 3-way max ( = max(max(a_part, b_part), c_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimax3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(max(a, b), c)
+ * 
+ * Calculates the 3-way max of unsigned integers \p a, \p b and \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimax3_u32(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(max(a, b), c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs a 3-way max ( = max(max(a_part, b_part), c_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimax3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(min(a, b), c)
+ * 
+ * Calculates the 3-way min of signed integers \p a, \p b and \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vimin3_s32(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(min(a, b), c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a 3-way min ( = min(min(a_part, b_part), c_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimin3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(min(a, b), c)
+ * 
+ * Calculates the 3-way min of unsigned integers \p a, \p b and \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimin3_u32(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(min(a, b), c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs a 3-way min ( = min(min(a_part, b_part), c_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimin3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(max(max(a, b), c), 0)
+ *
+ * Calculates the maximum of three signed ints, if this is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vimax3_s32_relu(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(max(max(a, b), c), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a three-way max with relu ( = max(a_part, b_part, c_part, 0) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimax3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(min(min(a, b), c), 0)
+ *
+ * Calculates the minimum of three signed ints, if this is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vimin3_s32_relu(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(min(min(a, b), c), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a three-way min with relu ( = max(min(a_part, b_part, c_part), 0) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimin3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(a + b, c)
+ *
+ * Calculates the sum of signed integers \p a and \p b and takes the max with \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __viaddmax_s32(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(a + b, c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs an add and compare: max(a_part + b_part, c_part)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmax_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(a + b, c)
+ *
+ * Calculates the sum of unsigned integers \p a and \p b and takes the max with \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmax_u32(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(a + b, c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs an add and compare: max(a_part + b_part, c_part)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmax_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(a + b, c)
+ *
+ * Calculates the sum of signed integers \p a and \p b and takes the min with \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __viaddmin_s32(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(a + b, c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs an add and compare: min(a_part + b_part, c_part)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmin_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(a + b, c)
+ *
+ * Calculates the sum of unsigned integers \p a and \p b and takes the min with \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmin_u32(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(a + b, c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs an add and compare: min(a_part + b_part, c_part)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmin_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(max(a + b, c), 0)
+ *
+ * Calculates the sum of signed integers \p a and \p b and takes the max with \p c.
+ * If the result is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __viaddmax_s32_relu(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(max(a + b, c), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs an add, followed by a max with relu: max(max(a_part + b_part, c_part), 0)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmax_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(min(a + b, c), 0)
+ *
+ * Calculates the sum of signed integers \p a and \p b and takes the min with \p c.
+ * If the result is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __viaddmin_s32_relu(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(min(a + b, c), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs an add, followed by a min with relu: max(min(a_part + b_part, c_part), 0)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmin_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(a, b), also sets the value pointed to by pred to (a >= b).
+ *
+ * Calculates the maximum of \p a and \p b of two signed ints. Also sets the value pointed to by \p pred to the value (a >= b).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vibmax_s32(const int a, const int b, bool* const pred);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(a, b), also sets the value pointed to by pred to (a >= b).
+ *
+ * Calculates the maximum of \p a and \p b of two unsigned ints. Also sets the value pointed to by \p pred to the value (a >= b).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmax_u32(const unsigned int a, const unsigned int b, bool* const pred);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(a, b), also sets the value pointed to by pred to (a <= b).
+ *
+ * Calculates the minimum of \p a and \p b of two signed ints. Also sets the value pointed to by \p pred to the value (a <= b).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vibmin_s32(const int a, const int b, bool* const pred);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(a, b), also sets the value pointed to by pred to (a <= b).
+ *
+ * Calculates the minimum of \p a and \p b of two unsigned ints. Also sets the value pointed to by \p pred to the value (a <= b).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmin_u32(const unsigned int a, const unsigned int b, bool* const pred);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(a, b), also sets the value pointed to by pred_hi and pred_lo to the per-halfword result of (a >= b).
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a maximum ( = max(a_part, b_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * Sets the value pointed to by \p pred_hi to the value (a_high_part >= b_high_part).
+ * Sets the value pointed to by \p pred_lo to the value (a_low_part >= b_low_part).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmax_s16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(a, b), also sets the value pointed to by pred_hi and pred_lo to the per-halfword result of (a >= b).
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs a maximum ( = max(a_part, b_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * Sets the value pointed to by \p pred_hi to the value (a_high_part >= b_high_part).
+ * Sets the value pointed to by \p pred_lo to the value (a_low_part >= b_low_part).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmax_u16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(a, b), also sets the value pointed to by pred_hi and pred_lo to the per-halfword result of (a <= b).
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a minimum ( = min(a_part, b_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * Sets the value pointed to by \p pred_hi to the value (a_high_part <= b_high_part).
+ * Sets the value pointed to by \p pred_lo to the value (a_low_part <= b_low_part).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmin_s16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(a, b), also sets the value pointed to by pred_hi and pred_lo to the per-halfword result of (a <= b).
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs a minimum ( = min(a_part, b_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * Sets the value pointed to by \p pred_hi to the value (a_high_part <= b_high_part).
+ * Sets the value pointed to by \p pred_lo to the value (a_low_part <= b_low_part).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmin_u16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo);
+
+/*******************************************************************************
+ *                                                                             *
+ *                            END SIMD functions                               *
+ *                                                                             *
+ *******************************************************************************/
+} //extern "c"
+#undef EXCLUDE_FROM_RTC
+
+#undef __DEVICE_FUNCTIONS_DECL__
+#undef __DEVICE_FUNCTIONS_STATIC_DECL__
+#undef __DEVICE_HOST_FUNCTIONS_STATIC_DECL__
+
+#endif /* __cplusplus && __CUDACC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDACC_RTC__)
+#include "device_functions.hpp"
+#endif /* !defined(__CUDACC_RTC__) */
+
+#include "device_atomic_functions.h"
+#include "device_double_functions.h"
+#include "sm_20_atomic_functions.h"
+#include "sm_32_atomic_functions.h"
+#include "sm_35_atomic_functions.h"
+#include "sm_60_atomic_functions.h"
+#include "sm_20_intrinsics.h"
+#include "sm_30_intrinsics.h"
+#include "sm_32_intrinsics.h"
+#include "sm_35_intrinsics.h"
+#include "sm_61_intrinsics.h"
+#include "sm_70_rt.h"
+#include "sm_80_rt.h"
+#include "sm_90_rt.h"
+#ifndef __CUDACC_RTC_MINIMAL__
+#include "texture_indirect_functions.h"
+#include "surface_indirect_functions.h"
+#endif  /* !__CUDACC_RTC_MINIMAL__ */
+#include "cudacc_ext.h"
+
+#ifdef __CUDACC__
+extern "C" __host__ __device__  unsigned CUDARTAPI __cudaPushCallConfiguration(dim3 gridDim,
+                                      dim3 blockDim, 
+                                      size_t sharedMem = 0, 
+                                      struct CUstream_st *stream = 0);
+
+#if !defined(__CUDACC_RTC__) &&!defined(__NV_LEGACY_LAUNCH)
+extern "C" cudaError_t CUDARTAPI __cudaGetKernel(cudaKernel_t *, const void *);
+
+extern "C"  cudaError_t CUDARTAPI __cudaLaunchKernel(
+        cudaKernel_t kernel,
+        dim3 gridDim,
+        dim3 blockDim,
+        void **args,
+        size_t sharedMem,
+        cudaStream_t stream
+);
+
+extern "C" cudaError_t CUDARTAPI __cudaLaunchKernel_ptsz(
+        cudaKernel_t kernel,
+        dim3 gridDim,
+        dim3 blockDim,
+        void **args,
+        size_t sharedMem,
+        cudaStream_t stream
+);
+
+// Referenced from compiler-generated kernel launch code: forwards all arguments unchanged to __cudaLaunchKernel_ptsz when __CUDART_API_PER_THREAD_DEFAULT_STREAM is defined, otherwise to __cudaLaunchKernel, and returns that call's cudaError_t.
+static inline cudaError_t __cudaLaunchKernel_helper(
+                                  cudaKernel_t kernel,
+                                  dim3 gridDim,
+                                  dim3 blockDim,
+                                  void **args,
+                                  size_t sharedMem,
+                                  cudaStream_t stream)
+{
+#if defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM)
+  return __cudaLaunchKernel_ptsz(kernel, gridDim, blockDim, args, sharedMem,
+                                 stream);
+#else  /* !__CUDART_API_PER_THREAD_DEFAULT_STREAM */
+  return __cudaLaunchKernel(kernel, gridDim, blockDim, args, sharedMem,
+                            stream);
+#endif  /* __CUDART_API_PER_THREAD_DEFAULT_STREAM */
+}
+#endif  /* !defined(__CUDACC_RTC__) && !defined(__NV_LEGACY_LAUNCH) */
+
+
+
+#endif  /* __CUDACC__ */
+
+#endif /* !__DEVICE_FUNCTIONS_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H__
+#endif
+
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_functions.hpp b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..88aa76f3cab6c57de39827d88435817171966989
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_functions.hpp
@@ -0,0 +1,1197 @@
+/*
+ * Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+// Direct-inclusion guard: if the including translation unit has not defined
+// __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__, emit a diagnostic (a #pragma
+// message under MSVC, #warning elsewhere), then define that macro together
+// with a marker macro recording that this header was the one that defined it.
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/device_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/device_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__
+#endif
+
+#if !defined(__DEVICE_FUNCTIONS_HPP__)
+#define __DEVICE_FUNCTIONS_HPP__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+// Declaration-specifier macros, only set up for C++ translation units
+// compiled with __CUDACC__ defined.
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+// Under __CUDACC_RTC__ the specifiers omit "static __inline__"; otherwise the
+// static variants include it.
+#if defined(__CUDACC_RTC__)
+#define __DEVICE_FUNCTIONS_DECL__ __device__
+#define __DEVICE_FUNCTIONS_STATIC_DECL__ __device__
+#define __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ __device__ __host__ __cudart_builtin__
+#else
+#define __DEVICE_FUNCTIONS_DECL__ __device__
+#define __DEVICE_FUNCTIONS_STATIC_DECL__ static __inline__ __device__
+#define __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ static __inline__ __device__ __host__ __cudart_builtin__
+#endif /* __CUDACC_RTC__ */
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+// Only two of the three macros are removed here;
+// __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ stays defined because the function
+// definitions that follow in this header use it.
+#undef __DEVICE_FUNCTIONS_DECL__
+#undef __DEVICE_FUNCTIONS_STATIC_DECL__
+
+#endif /* __cplusplus && __CUDACC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+// Feature-test macros used by the functions below to pick an implementation.
+// Each one is defined only when __CUDACC__ is defined and __CUDA_ARCH__ is
+// defined with at least the stated value (900, 700, 750); none of them is
+// defined when __CUDA_ARCH__ is absent.
+#ifdef __CUDACC__
+# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+#define __CUDA_AND_AT_LEAST_SM_90__
+#endif /* defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) */
+# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
+#define __CUDA_AND_AT_LEAST_SM_70__
+#endif /* defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) */
+# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
+#define __CUDA_AND_AT_LEAST_SM_75__
+#endif /* defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) */
+#endif /* __CUDACC__ */
+
+// Returns max(a, b), with a non-positive result replaced by 0.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax_s32_relu(const int a, const int b){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // SM_90 and newer: a single PTX max.s32.relu instruction.
+  int clamped;
+  asm("{max.s32.relu %0, %1, %2;}" : "=r"(clamped) : "r"(a), "r"(b));
+  return clamped;
+#else
+  // Host and pre-SM_90 device fallback: plain max, then clamp at zero.
+  const int larger = max(a, b);
+  if (larger > 0) {
+    return larger;
+  }
+  return 0;
+#endif
+}
+
+// Treats a and b as two packed signed 16-bit lanes each and returns the
+// lane-wise max, with negative lanes replaced by 0.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax_s16x2_relu(const unsigned int a, const unsigned int b){
+  unsigned int packed;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm("{max.s16x2.relu %0, %1, %2;}" : "=r"(packed) : "r"(a), "r"(b));
+#elif defined(__CUDA_ARCH__)
+  packed = __vmaxs2(__vmaxs2(a, b), 0U);
+#else
+  // Host fallback. Split each operand into its low and high 16-bit lanes.
+  unsigned short a_lo_bits = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi_bits = (unsigned short)(a >> 16);
+  unsigned short b_lo_bits = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi_bits = (unsigned short)(b >> 16);
+
+  // Reinterpret the lane bits as signed values.
+  short a_lo = *(short*)&a_lo_bits;
+  short a_hi = *(short*)&a_hi_bits;
+  short b_lo = *(short*)&b_lo_bits;
+  short b_hi = *(short*)&b_hi_bits;
+
+  // Lane-wise max followed by the relu clamp.
+  short out_lo = (short)max(a_lo, b_lo);
+  short out_hi = (short)max(a_hi, b_hi);
+  if (out_lo < 0) { out_lo = 0; }
+  if (out_hi < 0) { out_hi = 0; }
+
+  // Back to unsigned bits; low lane in bits 0-15, high lane in bits 16-31.
+  unsigned short out_lo_bits = *(unsigned short*)&out_lo;
+  unsigned short out_hi_bits = *(unsigned short*)&out_hi;
+  packed = ((unsigned int)out_lo_bits) | (((unsigned int)out_hi_bits) << 16);
+#endif
+
+  return packed;
+}
+
+// Returns min(a, b), with a non-positive result replaced by 0.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimin_s32_relu(const int a, const int b){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // SM_90 and newer: a single PTX min.s32.relu instruction.
+  int clamped;
+  asm("{min.s32.relu %0, %1, %2;}" : "=r"(clamped) : "r"(a), "r"(b));
+  return clamped;
+#else
+  // Host and pre-SM_90 device fallback: plain min, then clamp at zero.
+  const int smaller = min(a, b);
+  if (smaller > 0) {
+    return smaller;
+  }
+  return 0;
+#endif
+}
+
+// Treats a and b as two packed signed 16-bit lanes each and returns the
+// lane-wise min, with negative lanes replaced by 0.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin_s16x2_relu(const unsigned int a, const unsigned int b){
+  unsigned int packed;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm("{min.s16x2.relu %0, %1, %2;}" : "=r"(packed) : "r"(a), "r"(b));
+#elif defined(__CUDA_ARCH__)
+  packed = __vmaxs2(__vmins2(a, b), 0U);
+#else
+  // Host fallback. Split each operand into its low and high 16-bit lanes.
+  unsigned short a_lo_bits = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi_bits = (unsigned short)(a >> 16);
+  unsigned short b_lo_bits = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi_bits = (unsigned short)(b >> 16);
+
+  // Reinterpret the lane bits as signed values.
+  short a_lo = *(short*)&a_lo_bits;
+  short a_hi = *(short*)&a_hi_bits;
+  short b_lo = *(short*)&b_lo_bits;
+  short b_hi = *(short*)&b_hi_bits;
+
+  // Lane-wise min followed by the relu clamp.
+  short out_lo = (short)min(a_lo, b_lo);
+  short out_hi = (short)min(a_hi, b_hi);
+  if (out_lo < 0) { out_lo = 0; }
+  if (out_hi < 0) { out_hi = 0; }
+
+  // Back to unsigned bits; low lane in bits 0-15, high lane in bits 16-31.
+  unsigned short out_lo_bits = *(unsigned short*)&out_lo;
+  unsigned short out_hi_bits = *(unsigned short*)&out_hi;
+  packed = ((unsigned int)out_lo_bits) | (((unsigned int)out_hi_bits) << 16);
+#endif
+
+  return packed;
+}
+
+// Returns the largest of three signed 32-bit values.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax3_s32(const int a, const int b, const int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // SM_90 and newer: two chained max.s32 ops through a scratch register.
+  int largest;
+  asm ("{.reg .s32 t1; \n\t"
+      "max.s32 t1, %1, %2; \n\t"
+      "max.s32 %0, t1, %3;}\n\t"
+      : "=r"(largest) : "r"(a), "r"(b), "r"(c));
+  return largest;
+#else
+  // Host and pre-SM_90 device fallback.
+  const int ab = max(a, b);
+  return max(ab, c);
+#endif
+}
+
+// Treats a, b and c as two packed signed 16-bit lanes each and returns the
+// lane-wise maximum of the three.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int packed;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // Upstream note: asm naming/syntax may change in the future.
+  asm ("{.reg .b32 t1; \n\t"
+      "max.s16x2 t1, %1, %2; \n\t"
+      "max.s16x2 %0, t1, %3;}\n\t"
+      : "=r"(packed) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_AND_AT_LEAST_SM_70__)
+  packed = __vmaxs2(__vmaxs2(a, b), c);
+#else
+  // Host and pre-SM_70 device fallback. Split operands into 16-bit lanes.
+  unsigned short a_lo_bits = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi_bits = (unsigned short)(a >> 16);
+  unsigned short b_lo_bits = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi_bits = (unsigned short)(b >> 16);
+  unsigned short c_lo_bits = (unsigned short)(c & 0xFFFFU);
+  unsigned short c_hi_bits = (unsigned short)(c >> 16);
+
+  // Reinterpret the lane bits as signed values.
+  short a_lo = *(short*)&a_lo_bits;
+  short a_hi = *(short*)&a_hi_bits;
+  short b_lo = *(short*)&b_lo_bits;
+  short b_hi = *(short*)&b_hi_bits;
+  short c_lo = *(short*)&c_lo_bits;
+  short c_hi = *(short*)&c_hi_bits;
+
+  // Lane-wise three-way max.
+  short out_lo = (short)max(max(a_lo, b_lo), c_lo);
+  short out_hi = (short)max(max(a_hi, b_hi), c_hi);
+
+  // Back to unsigned bits; low lane in bits 0-15, high lane in bits 16-31.
+  unsigned short out_lo_bits = *(unsigned short*)&out_lo;
+  unsigned short out_hi_bits = *(unsigned short*)&out_hi;
+  packed = ((unsigned int)out_lo_bits) | (((unsigned int)out_hi_bits) << 16);
+#endif
+  return packed;
+}
+
+// Returns the largest of three unsigned 32-bit values.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_u32(const unsigned int a, const unsigned int b, const unsigned int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // The asm output is declared unsigned to match the max.u32 result and the
+  // return type (as __viaddmax_u32 does), instead of passing through a
+  // signed int and relying on an implicit conversion at the return.
+  unsigned int res;
+  asm ("{.reg .u32 t1; \n\t"
+      "max.u32 t1, %1, %2; \n\t"
+      "max.u32 %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+  return res;
+#else
+  // Host and older architecture code
+  return max(max(a, b), c);
+#endif
+}
+
+// Treats a, b and c as two packed unsigned 16-bit lanes each and returns the
+// lane-wise maximum of the three.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int packed;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "max.u16x2 t1, %1, %2; \n\t"
+      "max.u16x2 %0, t1, %3;}\n\t"
+      : "=r"(packed) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_ARCH__)
+  packed = __vmaxu2(__vmaxu2(a, b), c);
+#else
+  // Host fallback. Split operands into low and high 16-bit lanes.
+  unsigned short a_lo = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi = (unsigned short)(a >> 16);
+  unsigned short b_lo = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi = (unsigned short)(b >> 16);
+  unsigned short c_lo = (unsigned short)(c & 0xFFFFU);
+  unsigned short c_hi = (unsigned short)(c >> 16);
+
+  // Lane-wise three-way max.
+  unsigned short out_lo = (unsigned short)max(max(a_lo, b_lo), c_lo);
+  unsigned short out_hi = (unsigned short)max(max(a_hi, b_hi), c_hi);
+
+  // Repack: low lane in bits 0-15, high lane in bits 16-31.
+  packed = ((unsigned int)out_lo) | (((unsigned int)out_hi) << 16);
+#endif
+
+  return packed;
+}
+
+// Returns the smallest of three signed 32-bit values.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimin3_s32(const int a, const int b, const int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // SM_90 and newer: two chained min.s32 ops through a scratch register.
+  int smallest;
+  asm ("{.reg .s32 t1; \n\t"
+      "min.s32 t1, %1, %2; \n\t"
+      "min.s32 %0, t1, %3;}\n\t"
+      : "=r"(smallest) : "r"(a), "r"(b), "r"(c));
+  return smallest;
+#else
+  // Host and pre-SM_90 device fallback.
+  const int ab = min(a, b);
+  return min(ab, c);
+#endif
+}
+
+// Treats a, b and c as two packed signed 16-bit lanes each and returns the
+// lane-wise minimum of the three.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int packed;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "min.s16x2 t1, %1, %2; \n\t"
+      "min.s16x2 %0, t1, %3;}\n\t"
+      : "=r"(packed) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_AND_AT_LEAST_SM_70__)
+  packed = __vmins2(__vmins2(a, b), c);
+#else
+  // Host and pre-SM_70 device fallback. Split operands into 16-bit lanes.
+  unsigned short a_lo_bits = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi_bits = (unsigned short)(a >> 16);
+  unsigned short b_lo_bits = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi_bits = (unsigned short)(b >> 16);
+  unsigned short c_lo_bits = (unsigned short)(c & 0xFFFFU);
+  unsigned short c_hi_bits = (unsigned short)(c >> 16);
+
+  // Reinterpret the lane bits as signed values.
+  short a_lo = *(short*)&a_lo_bits;
+  short a_hi = *(short*)&a_hi_bits;
+  short b_lo = *(short*)&b_lo_bits;
+  short b_hi = *(short*)&b_hi_bits;
+  short c_lo = *(short*)&c_lo_bits;
+  short c_hi = *(short*)&c_hi_bits;
+
+  // Lane-wise three-way min.
+  short out_lo = (short)min(min(a_lo, b_lo), c_lo);
+  short out_hi = (short)min(min(a_hi, b_hi), c_hi);
+
+  // Back to unsigned bits; low lane in bits 0-15, high lane in bits 16-31.
+  unsigned short out_lo_bits = *(unsigned short*)&out_lo;
+  unsigned short out_hi_bits = *(unsigned short*)&out_hi;
+  packed = ((unsigned int)out_lo_bits) | (((unsigned int)out_hi_bits) << 16);
+#endif
+
+  return packed;
+}
+
+// Returns the smallest of three unsigned 32-bit values.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_u32(const unsigned int a, const unsigned int b, const unsigned int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // The asm output is declared unsigned to match the min.u32 result and the
+  // return type (as __viaddmin_u32 does), instead of passing through a
+  // signed int and relying on an implicit conversion at the return.
+  unsigned int res;
+  asm ("{.reg .u32 t1; \n\t"
+      "min.u32 t1, %1, %2; \n\t"
+      "min.u32 %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+  return res;
+#else
+  // Host and older architecture code
+  return min(min(a, b), c);
+#endif
+}
+
+// Treats a, b and c as two packed unsigned 16-bit lanes each and returns the
+// lane-wise minimum of the three.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int packed;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "min.u16x2 t1, %1, %2; \n\t"
+      "min.u16x2 %0, t1, %3;}\n\t"
+      : "=r"(packed) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_ARCH__)
+  packed = __vminu2(__vminu2(a, b), c);
+#else
+  // Host fallback. Split operands into low and high 16-bit lanes.
+  unsigned short a_lo = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi = (unsigned short)(a >> 16);
+  unsigned short b_lo = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi = (unsigned short)(b >> 16);
+  unsigned short c_lo = (unsigned short)(c & 0xFFFFU);
+  unsigned short c_hi = (unsigned short)(c >> 16);
+
+  // Lane-wise three-way min.
+  unsigned short out_lo = (unsigned short)min(min(a_lo, b_lo), c_lo);
+  unsigned short out_hi = (unsigned short)min(min(a_hi, b_hi), c_hi);
+
+  // Repack: low lane in bits 0-15, high lane in bits 16-31.
+  packed = ((unsigned int)out_lo) | (((unsigned int)out_hi) << 16);
+#endif
+
+  return packed;
+}
+
+// Returns the largest of three signed 32-bit values, with a non-positive
+// result replaced by 0.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax3_s32_relu(const int a, const int b, const int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // SM_90 and newer: two chained max.s32.relu ops through a scratch register.
+  int clamped;
+  asm ("{.reg .s32 t1; \n\t"
+      "max.s32.relu t1, %1, %2; \n\t"
+      "max.s32.relu %0, t1, %3;}\n\t"
+      : "=r"(clamped) : "r"(a), "r"(b), "r"(c));
+  return clamped;
+#else
+  // Host and pre-SM_90 device fallback: three-way max, then clamp at zero.
+  const int largest = max(max(a, b), c);
+  if (largest > 0) {
+    return largest;
+  }
+  return 0;
+#endif
+}
+
+// Treats a, b and c as two packed signed 16-bit lanes each and returns the
+// lane-wise maximum of the three, with negative lanes replaced by 0.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int packed;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "max.s16x2.relu t1, %1, %2; \n\t"
+      "max.s16x2.relu %0, t1, %3;}\n\t"
+      : "=r"(packed) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_AND_AT_LEAST_SM_75__)
+  packed = __vimax_s16x2_relu(__vmaxs2(a, b), c);
+#else
+  // Host and pre-SM_75 device fallback. Split operands into 16-bit lanes.
+  unsigned short a_lo_bits = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi_bits = (unsigned short)(a >> 16);
+  unsigned short b_lo_bits = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi_bits = (unsigned short)(b >> 16);
+  unsigned short c_lo_bits = (unsigned short)(c & 0xFFFFU);
+  unsigned short c_hi_bits = (unsigned short)(c >> 16);
+
+  // Reinterpret the lane bits as signed values.
+  short a_lo = *(short*)&a_lo_bits;
+  short a_hi = *(short*)&a_hi_bits;
+  short b_lo = *(short*)&b_lo_bits;
+  short b_hi = *(short*)&b_hi_bits;
+  short c_lo = *(short*)&c_lo_bits;
+  short c_hi = *(short*)&c_hi_bits;
+
+  // Lane-wise three-way max followed by the relu clamp.
+  short out_lo = (short)max(max(a_lo, b_lo), c_lo);
+  short out_hi = (short)max(max(a_hi, b_hi), c_hi);
+  if (out_lo < 0) { out_lo = 0; }
+  if (out_hi < 0) { out_hi = 0; }
+
+  // Back to unsigned bits; low lane in bits 0-15, high lane in bits 16-31.
+  unsigned short out_lo_bits = *(unsigned short*)&out_lo;
+  unsigned short out_hi_bits = *(unsigned short*)&out_hi;
+  packed = ((unsigned int)out_lo_bits) | (((unsigned int)out_hi_bits) << 16);
+#endif
+
+  return packed;
+}
+
+// Returns the smallest of three signed 32-bit values, with a non-positive
+// result replaced by 0.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimin3_s32_relu(const int a, const int b, const int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // SM_90 and newer: two chained min.s32.relu ops through a scratch register.
+  int clamped;
+  asm ("{.reg .s32 t1; \n\t"
+      "min.s32.relu t1, %1, %2; \n\t"
+      "min.s32.relu %0, t1, %3;}\n\t"
+      : "=r"(clamped) : "r"(a), "r"(b), "r"(c));
+  return clamped;
+#else
+  // Host and pre-SM_90 device fallback: three-way min, then clamp at zero.
+  const int smallest = min(min(a, b), c);
+  if (smallest > 0) {
+    return smallest;
+  }
+  return 0;
+#endif
+}
+
+// Treats a, b and c as two packed signed 16-bit lanes each and returns the
+// lane-wise minimum of the three, with negative lanes replaced by 0.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int packed;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "min.s16x2.relu t1, %1, %2; \n\t"
+      "min.s16x2.relu %0, t1, %3;}\n\t"
+      : "=r"(packed) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_AND_AT_LEAST_SM_75__)
+  packed = __vimin_s16x2_relu(__vmins2(a, b), c);
+#else
+  // Host and pre-SM_75 device fallback. Split operands into 16-bit lanes.
+  unsigned short a_lo_bits = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi_bits = (unsigned short)(a >> 16);
+  unsigned short b_lo_bits = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi_bits = (unsigned short)(b >> 16);
+  unsigned short c_lo_bits = (unsigned short)(c & 0xFFFFU);
+  unsigned short c_hi_bits = (unsigned short)(c >> 16);
+
+  // Reinterpret the lane bits as signed values.
+  short a_lo = *(short*)&a_lo_bits;
+  short a_hi = *(short*)&a_hi_bits;
+  short b_lo = *(short*)&b_lo_bits;
+  short b_hi = *(short*)&b_hi_bits;
+  short c_lo = *(short*)&c_lo_bits;
+  short c_hi = *(short*)&c_hi_bits;
+
+  // Lane-wise three-way min followed by the relu clamp.
+  short out_lo = (short)min(min(a_lo, b_lo), c_lo);
+  short out_hi = (short)min(min(a_hi, b_hi), c_hi);
+  if (out_lo < 0) { out_lo = 0; }
+  if (out_hi < 0) { out_hi = 0; }
+
+  // Back to unsigned bits; low lane in bits 0-15, high lane in bits 16-31.
+  unsigned short out_lo_bits = *(unsigned short*)&out_lo;
+  unsigned short out_hi_bits = *(unsigned short*)&out_hi;
+  packed = ((unsigned int)out_lo_bits) | (((unsigned int)out_hi_bits) << 16);
+#endif
+
+  return packed;
+}
+
+// Returns max(a + b, c) for signed 32-bit operands.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmax_s32(const int a, const int b, const int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  int res;
+  asm ("{.reg .s32 t1; \n\t"
+      "add.s32 t1, %1, %2; \n\t"
+      "max.s32 %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+  return res;
+#else
+  // Host and older architecture code.
+  // The sum is formed in unsigned arithmetic so an out-of-range a + b wraps
+  // instead of being signed-overflow undefined behaviour; the 16-bit lane
+  // fallbacks in this header wrap their sums the same way via a narrowing
+  // cast. The conversion back to int is modular on the compilers this
+  // header targets (NOTE(review): implementation-defined before C++20).
+  const int sum = (int)((unsigned int)a + (unsigned int)b);
+  return max(sum, c);
+#endif
+}
+
+// Treats a, b and c as two packed signed 16-bit lanes each and returns, per
+// lane, max(a + b, c), where the sum is truncated to 16 bits.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int packed;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "add.s16x2 t1, %1, %2; \n\t"
+      "max.s16x2 %0, t1, %3;}\n\t"
+      : "=r"(packed) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_ARCH__)
+  packed = __vmaxs2(__vadd2(a, b), c);
+#else
+  // Host fallback. Split operands into low and high 16-bit lanes.
+  unsigned short a_lo_bits = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi_bits = (unsigned short)(a >> 16);
+  unsigned short b_lo_bits = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi_bits = (unsigned short)(b >> 16);
+  unsigned short c_lo_bits = (unsigned short)(c & 0xFFFFU);
+  unsigned short c_hi_bits = (unsigned short)(c >> 16);
+
+  // Reinterpret the lane bits as signed values.
+  short a_lo = *(short*)&a_lo_bits;
+  short a_hi = *(short*)&a_hi_bits;
+  short b_lo = *(short*)&b_lo_bits;
+  short b_hi = *(short*)&b_hi_bits;
+  short c_lo = *(short*)&c_lo_bits;
+  short c_hi = *(short*)&c_hi_bits;
+
+  // Lane-wise add (narrowed back to short), then max against c.
+  short out_lo = (short)max((short)(a_lo + b_lo), c_lo);
+  short out_hi = (short)max((short)(a_hi + b_hi), c_hi);
+
+  // Back to unsigned bits; low lane in bits 0-15, high lane in bits 16-31.
+  unsigned short out_lo_bits = *(unsigned short*)&out_lo;
+  unsigned short out_hi_bits = *(unsigned short*)&out_hi;
+  packed = ((unsigned int)out_lo_bits) | (((unsigned int)out_hi_bits) << 16);
+#endif
+
+  return packed;
+}
+
+// Returns max(a + b, c) for unsigned 32-bit operands.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_u32(const unsigned int a, const unsigned int b, const unsigned int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // SM_90 and newer: add.u32 into a scratch register, then max.u32 with c.
+  unsigned int result;
+  asm ("{.reg .u32 t1; \n\t"
+      "add.u32 t1, %1, %2; \n\t"
+      "max.u32 %0, t1, %3;}\n\t"
+      : "=r"(result) : "r"(a), "r"(b), "r"(c));
+  return result;
+#else
+  // Host and pre-SM_90 device fallback.
+  const unsigned int sum = a + b;
+  return max(sum, c);
+#endif
+}
+
+// Treats a, b and c as two packed unsigned 16-bit lanes each and returns, per
+// lane, max(a + b, c), where the sum is truncated to 16 bits.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int packed;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "add.u16x2 t1, %1, %2; \n\t"
+      "max.u16x2 %0, t1, %3;}\n\t"
+      : "=r"(packed) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_ARCH__)
+  packed = __vmaxu2(__vadd2(a, b), c);
+#else
+  // Host fallback. Split operands into low and high 16-bit lanes.
+  unsigned short a_lo = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi = (unsigned short)(a >> 16);
+  unsigned short b_lo = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi = (unsigned short)(b >> 16);
+  unsigned short c_lo = (unsigned short)(c & 0xFFFFU);
+  unsigned short c_hi = (unsigned short)(c >> 16);
+
+  // Lane-wise add (narrowed back to 16 bits), then max against c.
+  unsigned short out_lo = (unsigned short)max((unsigned short)(a_lo + b_lo), c_lo);
+  unsigned short out_hi = (unsigned short)max((unsigned short)(a_hi + b_hi), c_hi);
+
+  // Repack: low lane in bits 0-15, high lane in bits 16-31.
+  packed = ((unsigned int)out_lo) | (((unsigned int)out_hi) << 16);
+#endif
+
+  return packed;
+}
+
+// Returns min(a + b, c) for signed 32-bit operands.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmin_s32(const int a, const int b, const int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  int res;
+  asm ("{.reg .s32 t1; \n\t"
+      "add.s32 t1, %1, %2; \n\t"
+      "min.s32 %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+  return res;
+#else
+  // Host and older architecture code.
+  // The sum is formed in unsigned arithmetic so an out-of-range a + b wraps
+  // instead of being signed-overflow undefined behaviour; the 16-bit lane
+  // fallbacks in this header wrap their sums the same way via a narrowing
+  // cast. The conversion back to int is modular on the compilers this
+  // header targets (NOTE(review): implementation-defined before C++20).
+  const int sum = (int)((unsigned int)a + (unsigned int)b);
+  return min(sum, c);
+#endif
+}
+
+// Treats a, b and c as two packed signed 16-bit lanes each and returns, per
+// lane, min(a + b, c), where the sum is truncated to 16 bits.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int packed;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "add.s16x2 t1, %1, %2; \n\t"
+      "min.s16x2 %0, t1, %3;}\n\t"
+      : "=r"(packed) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_ARCH__)
+  packed = __vmins2(__vadd2(a, b), c);
+#else
+  // Host fallback. Split operands into low and high 16-bit lanes.
+  unsigned short a_lo_bits = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi_bits = (unsigned short)(a >> 16);
+  unsigned short b_lo_bits = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi_bits = (unsigned short)(b >> 16);
+  unsigned short c_lo_bits = (unsigned short)(c & 0xFFFFU);
+  unsigned short c_hi_bits = (unsigned short)(c >> 16);
+
+  // Reinterpret the lane bits as signed values.
+  short a_lo = *(short*)&a_lo_bits;
+  short a_hi = *(short*)&a_hi_bits;
+  short b_lo = *(short*)&b_lo_bits;
+  short b_hi = *(short*)&b_hi_bits;
+  short c_lo = *(short*)&c_lo_bits;
+  short c_hi = *(short*)&c_hi_bits;
+
+  // Lane-wise add (narrowed back to short), then min against c.
+  short out_lo = (short)min((short)(a_lo + b_lo), c_lo);
+  short out_hi = (short)min((short)(a_hi + b_hi), c_hi);
+
+  // Back to unsigned bits; low lane in bits 0-15, high lane in bits 16-31.
+  unsigned short out_lo_bits = *(unsigned short*)&out_lo;
+  unsigned short out_hi_bits = *(unsigned short*)&out_hi;
+  packed = ((unsigned int)out_lo_bits) | (((unsigned int)out_hi_bits) << 16);
+#endif
+
+  return packed;
+}
+
+// Returns min(a + b, c) for unsigned 32-bit operands.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_u32(const unsigned int a, const unsigned int b, const unsigned int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // SM_90 and newer: add.u32 into a scratch register, then min.u32 with c.
+  unsigned int result;
+  asm ("{.reg .u32 t1; \n\t"
+      "add.u32 t1, %1, %2; \n\t"
+      "min.u32 %0, t1, %3;}\n\t"
+      : "=r"(result) : "r"(a), "r"(b), "r"(c));
+  return result;
+#else
+  // Host and pre-SM_90 device fallback.
+  const unsigned int sum = a + b;
+  return min(sum, c);
+#endif
+}
+
+// Treats a, b and c as two packed unsigned 16-bit lanes each and returns, per
+// lane, min(a + b, c), where the sum is truncated to 16 bits.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int packed;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "add.u16x2 t1, %1, %2; \n\t"
+      "min.u16x2 %0, t1, %3;}\n\t"
+      : "=r"(packed) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_ARCH__)
+  packed = __vminu2(__vadd2(a, b), c);
+#else
+  // Host fallback. Split operands into low and high 16-bit lanes.
+  unsigned short a_lo = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi = (unsigned short)(a >> 16);
+  unsigned short b_lo = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi = (unsigned short)(b >> 16);
+  unsigned short c_lo = (unsigned short)(c & 0xFFFFU);
+  unsigned short c_hi = (unsigned short)(c >> 16);
+
+  // Lane-wise add (narrowed back to 16 bits), then min against c.
+  unsigned short out_lo = (unsigned short)min((unsigned short)(a_lo + b_lo), c_lo);
+  unsigned short out_hi = (unsigned short)min((unsigned short)(a_hi + b_hi), c_hi);
+
+  // Repack: low lane in bits 0-15, high lane in bits 16-31.
+  packed = ((unsigned int)out_lo) | (((unsigned int)out_hi) << 16);
+#endif
+
+  return packed;
+}
+
+// Returns max(a + b, c) for signed 32-bit operands, with a non-positive
+// result replaced by 0.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmax_s32_relu(const int a, const int b, const int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  int res;
+  asm ("{.reg .s32 t1; \n\t"
+      "add.s32 t1, %1, %2; \n\t"
+      "max.s32.relu %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+  return res;
+#else
+  // Host and older architecture code.
+  // The sum is formed in unsigned arithmetic so an out-of-range a + b wraps
+  // instead of being signed-overflow undefined behaviour; the 16-bit lane
+  // fallbacks in this header wrap their sums the same way via a narrowing
+  // cast. The conversion back to int is modular on the compilers this
+  // header targets (NOTE(review): implementation-defined before C++20).
+  const int sum = (int)((unsigned int)a + (unsigned int)b);
+  int ans = max(sum, c);
+
+  return (ans > 0) ? ans : 0;
+#endif
+}
+
+// Treats a, b and c as two packed signed 16-bit lanes each and returns, per
+// lane, max(a + b, c) with the sum truncated to 16 bits and negative lanes
+// replaced by 0.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int packed;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "add.s16x2 t1, %1, %2; \n\t"
+      "max.s16x2.relu %0, t1, %3;}\n\t"
+      : "=r"(packed) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_ARCH__)
+  packed = __vimax_s16x2_relu(__vadd2(a, b), c);
+#else
+  // Host fallback. Split operands into low and high 16-bit lanes.
+  unsigned short a_lo_bits = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi_bits = (unsigned short)(a >> 16);
+  unsigned short b_lo_bits = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi_bits = (unsigned short)(b >> 16);
+  unsigned short c_lo_bits = (unsigned short)(c & 0xFFFFU);
+  unsigned short c_hi_bits = (unsigned short)(c >> 16);
+
+  // Reinterpret the lane bits as signed values.
+  short a_lo = *(short*)&a_lo_bits;
+  short a_hi = *(short*)&a_hi_bits;
+  short b_lo = *(short*)&b_lo_bits;
+  short b_hi = *(short*)&b_hi_bits;
+  short c_lo = *(short*)&c_lo_bits;
+  short c_hi = *(short*)&c_hi_bits;
+
+  // Lane-wise add (narrowed back to short), max against c, then relu clamp.
+  short out_lo = (short)max((short)(a_lo + b_lo), c_lo);
+  short out_hi = (short)max((short)(a_hi + b_hi), c_hi);
+  if (out_lo < 0) { out_lo = 0; }
+  if (out_hi < 0) { out_hi = 0; }
+
+  // Back to unsigned bits; low lane in bits 0-15, high lane in bits 16-31.
+  unsigned short out_lo_bits = *(unsigned short*)&out_lo;
+  unsigned short out_hi_bits = *(unsigned short*)&out_hi;
+  packed = ((unsigned int)out_lo_bits) | (((unsigned int)out_hi_bits) << 16);
+#endif
+
+  return packed;
+}
+
+// Returns min(a + b, c) for signed 32-bit operands, with a non-positive
+// result replaced by 0.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmin_s32_relu(const int a, const int b, const int c){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  int res;
+  asm ("{.reg .s32 t1; \n\t"
+      "add.s32 t1, %1, %2; \n\t"
+      "min.s32.relu %0, t1, %3;}\n\t"
+      : "=r"(res) : "r"(a), "r"(b), "r"(c));
+  return res;
+#else
+  // Host and older architecture code.
+  // The sum is formed in unsigned arithmetic so an out-of-range a + b wraps
+  // instead of being signed-overflow undefined behaviour; the 16-bit lane
+  // fallbacks in this header wrap their sums the same way via a narrowing
+  // cast. The conversion back to int is modular on the compilers this
+  // header targets (NOTE(review): implementation-defined before C++20).
+  const int sum = (int)((unsigned int)a + (unsigned int)b);
+  int ans = min(sum, c);
+
+  return (ans > 0) ? ans : 0;
+#endif
+}
+
+// Treats a, b and c as two packed signed 16-bit lanes each and returns, per
+// lane, min(a + b, c) with the sum truncated to 16 bits and negative lanes
+// replaced by 0.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
+  unsigned int packed;
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  asm ("{.reg .b32 t1; \n\t"
+      "add.s16x2 t1, %1, %2; \n\t"
+      "min.s16x2.relu %0, t1, %3;}\n\t"
+      : "=r"(packed) : "r"(a), "r"(b), "r"(c));
+#elif defined(__CUDA_ARCH__)
+  packed = __vimin_s16x2_relu(__vadd2(a, b), c);
+#else
+  // Host fallback. Split operands into low and high 16-bit lanes.
+  unsigned short a_lo_bits = (unsigned short)(a & 0xFFFFU);
+  unsigned short a_hi_bits = (unsigned short)(a >> 16);
+  unsigned short b_lo_bits = (unsigned short)(b & 0xFFFFU);
+  unsigned short b_hi_bits = (unsigned short)(b >> 16);
+  unsigned short c_lo_bits = (unsigned short)(c & 0xFFFFU);
+  unsigned short c_hi_bits = (unsigned short)(c >> 16);
+
+  // Reinterpret the lane bits as signed values.
+  short a_lo = *(short*)&a_lo_bits;
+  short a_hi = *(short*)&a_hi_bits;
+  short b_lo = *(short*)&b_lo_bits;
+  short b_hi = *(short*)&b_hi_bits;
+  short c_lo = *(short*)&c_lo_bits;
+  short c_hi = *(short*)&c_hi_bits;
+
+  // Lane-wise add (narrowed back to short), min against c, then relu clamp.
+  short out_lo = (short)min((short)(a_lo + b_lo), c_lo);
+  short out_hi = (short)min((short)(a_hi + b_hi), c_hi);
+  if (out_lo < 0) { out_lo = 0; }
+  if (out_hi < 0) { out_hi = 0; }
+
+  // Back to unsigned bits; low lane in bits 0-15, high lane in bits 16-31.
+  unsigned short out_lo_bits = *(unsigned short*)&out_lo;
+  unsigned short out_hi_bits = *(unsigned short*)&out_hi;
+  packed = ((unsigned int)out_lo_bits) | (((unsigned int)out_hi_bits) << 16);
+#endif
+
+  return packed;
+}
+
+// vimax / vimin variants that also report a predicate.
+// Returns max(a, b) for signed 32-bit operands and stores (a >= b) in *pred.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vibmax_s32(const int a, const int b, bool* const pred){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // SM_90 and newer: one setp.ge comparison drives both the value select and
+  // the 0/1 predicate select.
+  int selected;
+  unsigned int a_ge_b;
+  asm ("{ .reg .pred __$temp1;\n\t"
+      "  setp.ge.s32  __$temp1, %2, %3;\n\t"
+      "  selp.s32 %0, %2, %3, __$temp1;\n\t"
+      "  selp.s32 %1, 1, 0, __$temp1;}\n\t"
+      : "=r"(selected), "=r"(a_ge_b) : "r"(a), "r"(b));
+
+  *pred = (bool)a_ge_b;
+  return selected;
+#else
+  // Host and pre-SM_90 device fallback.
+  const int larger = max(a, b);
+  *pred = (a >= b);
+  return larger;
+#endif
+}
+
+// Returns max(a, b) for unsigned 32-bit operands and stores (a >= b) in *pred.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmax_u32(const unsigned int a, const unsigned int b, bool* const pred){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // SM_90 and newer: one setp.ge comparison drives both the value select and
+  // the 0/1 predicate select.
+  unsigned int selected;
+  unsigned int a_ge_b;
+  asm ("{ .reg .pred __$temp1;\n\t"
+      "  setp.ge.u32  __$temp1, %2, %3;\n\t"
+      "  selp.u32 %0, %2, %3, __$temp1;\n\t"
+      "  selp.u32 %1, 1, 0, __$temp1;}\n\t"
+      : "=r"(selected), "=r"(a_ge_b) : "r"(a), "r"(b));
+
+  *pred = (bool)a_ge_b;
+  return selected;
+#else
+  // Host and pre-SM_90 device fallback.
+  const unsigned int larger = max(a, b);
+  *pred = (a >= b);
+  return larger;
+#endif
+}
+
+// Returns min(a, b) for signed 32-bit operands and stores (a <= b) in *pred.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vibmin_s32(const int a, const int b, bool* const pred){
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  // SM_90 and newer: one setp.le comparison drives both the value select and
+  // the 0/1 predicate select.
+  int selected;
+  unsigned int a_le_b;
+  asm ("{ .reg .pred __$temp1;\n\t"
+      "  setp.le.s32  __$temp1, %2, %3;\n\t"
+      "  selp.s32 %0, %2, %3, __$temp1;\n\t"
+      "  selp.s32 %1, 1, 0, __$temp1;}\n\t"
+      : "=r"(selected), "=r"(a_le_b) : "r"(a), "r"(b));
+
+  *pred = (bool)a_le_b;
+  return selected;
+#else
+  // Host and pre-SM_90 device fallback.
+  const int smaller = min(a, b);
+  *pred = (a <= b);
+  return smaller;
+#endif
+}
+
+// __vibmin_u32 returns the unsigned min(a, b) and stores '(a <= b)' into *pred.
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmin_u32(const unsigned int a, const unsigned int b, bool* const pred){
+#ifndef __CUDA_AND_AT_LEAST_SM_90__
+  // Host and older architecture code: ordinary compare plus min().
+  *pred = (a <= b);
+  return min(a, b);
+#else
+  // Inline PTX: a single setp.le predicate drives both selp instructions.
+  unsigned int selected;
+  unsigned int flag;
+  asm ("{ .reg .pred __$temp1;\n\t"
+      "  setp.le.u32  __$temp1, %2, %3;\n\t"
+      "  selp.u32 %0, %2, %3, __$temp1;\n\t"
+      "  selp.u32 %1, 1, 0, __$temp1;}\n\t"
+      : "=r"(selected), "=r"(flag)
+      : "r"(a), "r"(b));
+
+  *pred = (flag != 0U);
+  return selected;
+#endif
+}
+
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmax_s16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){ // signed max of each 16-bit half of a and b; *pred_hi / *pred_lo receive '(a >= b)' for the upper / lower half
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  unsigned int val;                 // packed result (asm operand %0)
+  unsigned int predicate_local_hi;  // 0/1 flag for the upper half (%1)
+  unsigned int predicate_local_lo;  // 0/1 flag for the lower half (%2)
+  asm ("{.reg .pred pu, pv; \n\t"
+      ".reg .s16 rs0, rs1, rs2, rs3; \n\t"
+      "max.s16x2 %0, %3, %4; \n\t"       // %0 = per-half signed max of a (%3) and b (%4)
+      "mov.b32 {rs0, rs1}, %0; \n\t"     // unpack the result: rs0 = lower half, rs1 = upper half
+      "mov.b32 {rs2, rs3}, %3; \n\t"     // unpack a the same way
+      "setp.eq.s16 pv, rs0, rs2; \n\t"   // pv: lower result half equals a's, i.e. a won or tied
+      "setp.eq.s16 pu, rs1, rs3; \n\t"   // pu: same test for the upper half
+      "selp.b32 %1, 1, 0, pu; \n\t"      // pu -> predicate_local_hi
+      "selp.b32 %2, 1, 0, pv;} \n\t"     // pv -> predicate_local_lo
+      : "=r"(val), "=r"(predicate_local_hi),"=r"(predicate_local_lo) : "r"(a), "r"(b));
+  // Collapse the 0/1 flag words to bool for the caller.
+  *pred_hi = (bool)predicate_local_hi;
+  *pred_lo = (bool)predicate_local_lo;
+  return val;
+#else
+  // Host and older architecture code: emulate each 16-bit half in plain C++.
+  // Split each operand into its lower and upper 16-bit halves:
+  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
+  unsigned short aU_hi = (unsigned short)(a >> 16);
+
+  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
+  unsigned short bU_hi = (unsigned short)(b >> 16);
+
+  // Reinterpret the bits of each half as a signed 16-bit value:
+  short aS_lo = *(short*)& aU_lo;
+  short aS_hi = *(short*)& aU_hi;
+
+  short bS_lo = *(short*)& bU_lo;
+  short bS_hi = *(short*)& bU_hi;
+
+  // Per-half signed maximum:
+  short ansS_lo = (short)max(aS_lo, bS_lo);
+  short ansS_hi = (short)max(aS_hi, bS_hi);
+  // Flags use the signed comparison; a tie (a == b) reports true.
+  *pred_hi = (aS_hi >= bS_hi);
+  *pred_lo = (aS_lo >= bS_lo);
+
+  // Reinterpret the signed results as raw 16-bit patterns again:
+  unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
+  unsigned short ansU_hi = *(unsigned short*)& ansS_hi;
+
+  // Repack: lower half in bits 0-15, upper half in bits 16-31.
+  unsigned int ans = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
+
+  return ans;  
+#endif
+}
+
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmax_u16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){ // unsigned max of each 16-bit half of a and b; *pred_hi / *pred_lo receive '(a >= b)' for the upper / lower half
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  unsigned int val;                 // packed result (asm operand %0)
+  unsigned int predicate_local_hi;  // 0/1 flag for the upper half (%1)
+  unsigned int predicate_local_lo;  // 0/1 flag for the lower half (%2)
+  asm ("{.reg .pred pu, pv; \n\t"
+      ".reg .u16 rs0, rs1, rs2, rs3; \n\t"
+      "max.u16x2 %0, %3, %4; \n\t"       // %0 = per-half unsigned max of a (%3) and b (%4)
+      "mov.b32 {rs0, rs1}, %0; \n\t"     // unpack the result: rs0 = lower half, rs1 = upper half
+      "mov.b32 {rs2, rs3}, %3; \n\t"     // unpack a the same way
+      "setp.eq.u16 pv, rs0, rs2; \n\t"   // pv: lower result half equals a's, i.e. a won or tied
+      "setp.eq.u16 pu, rs1, rs3; \n\t"   // pu: same test for the upper half
+      "selp.b32 %1, 1, 0, pu; \n\t"      // pu -> predicate_local_hi
+      "selp.b32 %2, 1, 0, pv;} \n\t"     // pv -> predicate_local_lo
+      : "=r"(val), "=r"(predicate_local_hi),"=r"(predicate_local_lo) : "r"(a), "r"(b));
+  // Collapse the 0/1 flag words to bool for the caller.
+  *pred_hi = (bool)predicate_local_hi;
+  *pred_lo = (bool)predicate_local_lo;
+  return val;
+#else
+  // Host and older architecture code: emulate each 16-bit half in plain C++.
+  // Split each operand into its lower and upper 16-bit halves:
+  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
+  unsigned short aU_hi = (unsigned short)(a >> 16);
+
+  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
+  unsigned short bU_hi = (unsigned short)(b >> 16);
+
+  // Per-half unsigned maximum:
+  unsigned short ansU_lo = (unsigned short)max(aU_lo, bU_lo);
+  unsigned short ansU_hi = (unsigned short)max(aU_hi, bU_hi);
+  // Flags use the unsigned comparison; a tie (a == b) reports true.
+  *pred_hi = (aU_hi >= bU_hi);
+  *pred_lo = (aU_lo >= bU_lo);
+
+  // Repack: lower half in bits 0-15, upper half in bits 16-31.
+  unsigned int ans = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
+
+  return ans;  
+#endif
+}
+
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmin_s16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){ // signed min of each 16-bit half of a and b; *pred_hi / *pred_lo receive '(a <= b)' for the upper / lower half
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  unsigned int val;                 // packed result (asm operand %0)
+  unsigned int predicate_local_hi;  // 0/1 flag for the upper half (%1)
+  unsigned int predicate_local_lo;  // 0/1 flag for the lower half (%2)
+  asm ("{.reg .pred pu, pv; \n\t"
+      ".reg .u16 rs0, rs1, rs2, rs3; \n\t" // NOTE(review): declared .u16 although the compares below are .s16 (__vibmax_s16x2 declares .s16); an equality test does not depend on signedness, so this looks harmless -- confirm
+      "min.s16x2 %0, %3, %4; \n\t"       // %0 = per-half signed min of a (%3) and b (%4)
+      "mov.b32 {rs0, rs1}, %0; \n\t"     // unpack the result: rs0 = lower half, rs1 = upper half
+      "mov.b32 {rs2, rs3}, %3; \n\t"     // unpack a the same way
+      "setp.eq.s16 pv, rs0, rs2; \n\t"   // pv: lower result half equals a's, i.e. a won or tied
+      "setp.eq.s16 pu, rs1, rs3; \n\t"   // pu: same test for the upper half
+      "selp.b32 %1, 1, 0, pu; \n\t"      // pu -> predicate_local_hi
+      "selp.b32 %2, 1, 0, pv;} \n\t"     // pv -> predicate_local_lo
+      : "=r"(val), "=r"(predicate_local_hi),"=r"(predicate_local_lo) : "r"(a), "r"(b));
+  // Collapse the 0/1 flag words to bool for the caller.
+  *pred_hi = (bool)predicate_local_hi;
+  *pred_lo = (bool)predicate_local_lo;
+  return val;
+#else
+  // Host and older architecture code: emulate each 16-bit half in plain C++.
+  // Split each operand into its lower and upper 16-bit halves:
+  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
+  unsigned short aU_hi = (unsigned short)(a >> 16);
+
+  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
+  unsigned short bU_hi = (unsigned short)(b >> 16);
+
+  // Reinterpret the bits of each half as a signed 16-bit value:
+  short aS_lo = *(short*)& aU_lo;
+  short aS_hi = *(short*)& aU_hi;
+
+  short bS_lo = *(short*)& bU_lo;
+  short bS_hi = *(short*)& bU_hi;
+
+  // Per-half signed minimum:
+  short ansS_lo = (short)min(aS_lo, bS_lo);
+  short ansS_hi = (short)min(aS_hi, bS_hi);
+  // Flags use the signed comparison; a tie (a == b) reports true.
+  *pred_hi = (aS_hi <= bS_hi);
+  *pred_lo = (aS_lo <= bS_lo);
+
+  // Reinterpret the signed results as raw 16-bit patterns again:
+  unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
+  unsigned short ansU_hi = *(unsigned short*)& ansS_hi;
+
+  // Repack: lower half in bits 0-15, upper half in bits 16-31.
+  unsigned int ans = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
+
+  return ans;  
+#endif
+}
+
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmin_u16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){ // unsigned min of each 16-bit half of a and b; *pred_hi / *pred_lo receive '(a <= b)' for the upper / lower half
+#ifdef __CUDA_AND_AT_LEAST_SM_90__
+  unsigned int val;                 // packed result (asm operand %0)
+  unsigned int predicate_local_hi;  // 0/1 flag for the upper half (%1)
+  unsigned int predicate_local_lo;  // 0/1 flag for the lower half (%2)
+  asm ("{.reg .pred pu, pv; \n\t"
+      ".reg .u16 rs0, rs1, rs2, rs3; \n\t"
+      "min.u16x2 %0, %3, %4; \n\t"       // %0 = per-half unsigned min of a (%3) and b (%4)
+      "mov.b32 {rs0, rs1}, %0; \n\t"     // unpack the result: rs0 = lower half, rs1 = upper half
+      "mov.b32 {rs2, rs3}, %3; \n\t"     // unpack a the same way
+      "setp.eq.u16 pv, rs0, rs2; \n\t"   // pv: lower result half equals a's, i.e. a won or tied
+      "setp.eq.u16 pu, rs1, rs3; \n\t"   // pu: same test for the upper half
+      "selp.b32 %1, 1, 0, pu; \n\t"      // pu -> predicate_local_hi
+      "selp.b32 %2, 1, 0, pv;} \n\t"     // pv -> predicate_local_lo
+      : "=r"(val), "=r"(predicate_local_hi),"=r"(predicate_local_lo) : "r"(a), "r"(b));
+  // Collapse the 0/1 flag words to bool for the caller.
+  *pred_hi = (bool)predicate_local_hi;
+  *pred_lo = (bool)predicate_local_lo;
+  return val;
+#else
+  // Host and older architecture code: emulate each 16-bit half in plain C++.
+  // Split each operand into its lower and upper 16-bit halves:
+  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
+  unsigned short aU_hi = (unsigned short)(a >> 16);
+
+  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
+  unsigned short bU_hi = (unsigned short)(b >> 16);
+
+  // Per-half unsigned minimum:
+  unsigned short ansU_lo = (unsigned short)min(aU_lo, bU_lo);
+  unsigned short ansU_hi = (unsigned short)min(aU_hi, bU_hi);
+  // Flags use the unsigned comparison; a tie (a == b) reports true.
+  *pred_hi = (aU_hi <= bU_hi);
+  *pred_lo = (aU_lo <= bU_lo);
+
+  // Repack: lower half in bits 0-15, upper half in bits 16-31.
+  unsigned int ans = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
+
+  return ans;  
+#endif
+}
+
+#ifdef __CUDA_AND_AT_LEAST_SM_90__ // drop the architecture-selection helper so it does not leak to includers
+#undef __CUDA_AND_AT_LEAST_SM_90__
+#endif
+// The declaration-prefix helper used by the functions above is likewise private to this header.
+#undef __DEVICE_HOST_FUNCTIONS_STATIC_DECL__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#endif /* !__DEVICE_FUNCTIONS_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__) // presumably set by this header's own prologue (not visible here) when it had to define the marker itself
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ // undo the internal-header marker
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__ // and the bookkeeping flag
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/func_macro.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/func_macro.h
new file mode 100644
index 0000000000000000000000000000000000000000..633554a01aaabd1bca5ae278c276710f323d5d7b
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/func_macro.h
@@ -0,0 +1,57 @@
+/*
+ * NVIDIA_COPYRIGHT_BEGIN
+ *
+ * Copyright (c) 2008-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ * NVIDIA_COPYRIGHT_END
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* internal-header marker not set by the includer: warn, then set it locally */
+#if defined(_MSC_VER)
+#pragma message("crt/func_macro.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/func_macro.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__ /* tells the end of this file to undo the define above */
+#endif
+/* Include guard: the definitions below are processed once per translation unit. */
+#if !defined(__FUNC_MACRO_H__)
+#define __FUNC_MACRO_H__
+/* Reject inclusion unless __CUDA_INTERNAL_COMPILATION__ is defined. */
+#if !defined(__CUDA_INTERNAL_COMPILATION__)
+
+#error -- incorrect inclusion of a cudart header file
+
+#endif /* !__CUDA_INTERNAL_COMPILATION__ */
+/* Declaration wrappers: each macro prepends compiler-specific specifiers to `decl`. */
+#if defined(__GNUC__)
+/* Function-like macro: only `__func__(...)` is expanded; a bare `__func__` token is left alone. */
+#define __func__(decl) \
+        inline decl
+/* `static` plus the GNU unused attribute, so an unreferenced declaration does not warn. */
+#define __device_func__(decl) \
+        static __attribute__((__unused__)) decl
+
+#elif defined(_WIN32)
+/* Non-GNU Windows compilers: same wrappers without GNU attributes. */
+#define __func__(decl) \
+        static inline decl
+
+#define __device_func__(decl) \
+        static decl
+
+#endif /* __GNUC__ / _WIN32 -- neither macro is defined for any other compiler */
+
+#endif /* __FUNC_MACRO_H__ */
+/* Undo the marker only if this header was the one that set it. */
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_config.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..90367d877adc5157296d1052b30a6c495a6dc898
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_config.h
@@ -0,0 +1,310 @@
+/*
+ * Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* internal-header marker not set by the includer: warn, then set it locally */
+#if defined(_MSC_VER)
+#pragma message("crt/host_config.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/host_config.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__ /* tells the end of this file to undo the define above */
+#endif
+
+#if !defined(__HOST_CONFIG_H__)
+#define __HOST_CONFIG_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDACC__)
+
+#if defined(__CUDACC_RTC__)
+
+#define _CRTIMP
+#define __THROW
+
+#else /* __CUDACC_RTC__ */
+
+/* check for host compilers that are compatible with nvcc: one that defines __GNUC__, or any compiler on _WIN32 */
+#if !defined(__GNUC__) && !defined(_WIN32)
+
+#error --- !!! UNSUPPORTED COMPILER !!! ---
+
+#endif /* !__GNUC__ && !_WIN32 */
+
+/* check invalid configurations: each test below aborts the build with #error */
+#if defined(__PGIC__)
+#if !defined(__GNUC__) || !defined(__LP64__) || !defined(__linux__)
+#error -- unsupported pgc++ configuration! pgc++ is supported only on Linux x86_64!
+#endif /* !defined(__GNUC__) || !defined(__LP64__) || !defined(__linux__) */
+#endif  /* defined(__PGIC__) */
+/* PowerPC: require the 64-bit, little-endian variant */
+#if defined(__powerpc__)
+#if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
+#error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
+#endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
+#endif /* __powerpc__ */
+/* Apple platforms: require a compiler that defines __clang__ */
+#if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
+#error -- clang and clang++ are the only supported host compilers on Mac OS X!
+#endif /* __APPLE__ && __MACH__ && !__clang__ */
+
+
+/* check host compiler version  */
+#if !__NV_NO_HOST_COMPILER_CHECK
+
+#if defined(__ICC)
+
+#if (__ICC != 1500 && __ICC != 1600 && __ICC != 1700 && __ICC != 1800 && !(__ICC >= 1900 && __ICC <= 2021)) || !defined(__GNUC__) || !defined(__LP64__)
+
+#error -- unsupported ICC configuration! Only ICC 15.0, ICC 16.0, ICC 17.0, ICC 18.0, ICC 19.x and 20.x on Linux x86_64 are supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+ 
+#endif /* (__ICC != 1500 && __ICC != 1600 && __ICC != 1700 && __ICC != 1800 && __ICC != 1900) || !__GNUC__ || !__LP64__ */
+
+#endif /* __ICC */
+
+#if defined(__GRCO_CLANG_COMPILER__)
+#if (__GRCO_CLANG_COMPILER__ == 1) && ((__clang_major__ < 16) || (__clang_major__ > 17))
+#error -- unsupported Grace clang version! The version must be 16.x to 17.x. The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+#endif  /* (__GRCO_CLANG_COMPILER__ == 1) && ((__clang_major__ < 16) || (__clang_major__ > 17)) */
+
+#endif /* __GRCO_CLANG_COMPILER__  */
+
+#if defined(__INTEL_CLANG_COMPILER)
+#error -- unsupported Intel ICX compiler! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+#endif /* __INTEL_CLANG_COMPILER */
+
+#if defined(__powerpc__)
+
+#if defined(__ibmxl_vrm__) && !(__ibmxl_vrm__ >= 0x0d010000 && __ibmxl_vrm__ < 0x0d020000) && \
+                              !(__ibmxl_vrm__ >= 0x10010000 && __ibmxl_vrm__ < 0x10020000)
+
+#error -- unsupported xlC version! only xlC 13.1 and 16.1 are supported. The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+
+#endif /* __ibmxl_vrm__ && !(__ibmxl_vrm__ >= 0x0d010000 && __ibmxl_vrm__ < 0x0d020000) &&
+                           !(__ibmxl_vrm__ >= 0x10010000 && __ibmxl_vrm__ < 0x10020000) */
+
+#endif /* __powerpc__ */
+
+#if defined(__GNUC__)
+
+#if __GNUC__ > 13
+
+#error -- unsupported GNU version! gcc versions later than 13 are not supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+
+#endif /* __GNUC__ > 13 */
+
+
+#if defined(__HORIZON__)
+#if (__clang_major__ >= 18) || (__clang_major__ < 3) || ((__clang_major__ == 3) &&  (__clang_minor__ < 3))
+#error -- unsupported HOS clang version! The version must be must be less than 18 and greater than 3.2 . The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+#endif  /* (__clang_major__ >= 18) || (__clang_major__ < 3) || ((__clang_major__ == 3) &&  (__clang_minor__ < 3)) */
+#endif /* __HORIZON__  */
+
+#if defined(__clang__) && !defined(__ibmxl_vrm__) && !defined(__ICC) && !defined(__HORIZON__) && !defined(__APPLE__) && !defined(__GRCO_CLANG_COMPILER__)
+
+#if (__clang_major__ >= 18) || (__clang_major__ < 3) || ((__clang_major__ == 3) &&  (__clang_minor__ < 3))
+#error -- unsupported clang version! clang version must be less than 18 and greater than 3.2 . The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+
+#endif  /* (__clang_major__ >=  18) || (__clang_major__ < 3) || ((__clang_major__ == 3) &&  (__clang_minor__ < 3)) */
+
+#endif /* defined(__clang__) && !defined(__ibmxl_vrm__) && !defined(__ICC) && !defined(__HORIZON__) && !defined(__APPLE__) && !defined(__GRCO_CLANG_COMPILER__) */
+
+
+#endif /* __GNUC__ */
+
+#if defined(_WIN32)
+
+#if _MSC_VER < 1910 || _MSC_VER >= 1950
+
+#error -- unsupported Microsoft Visual Studio version! Only the versions between 2017 and 2022 (inclusive) are supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+
+#elif _MSC_VER >= 1910 && _MSC_VER < 1910
+
+#pragma message("support for this version of Microsoft Visual Studio has been deprecated! Only the versions between 2017 and 2022 (inclusive) are supported!")
+
+#endif /* (_MSC_VER < 1910 || _MSC_VER >= 1950) || (_MSC_VER >= 1910 && _MSC_VER < 1910) */
+
+#endif /* _WIN32 */
+#endif  /* !__NV_NO_HOST_COMPILER_CHECK */
+
+
+/* configure host compiler: provide _CRTIMP, _ACRTIMP and __THROW for each supported host */
+#if defined(__APPLE__)
+/* Apple: all three decorations expand to nothing */
+#define _CRTIMP
+#define _ACRTIMP
+#define __THROW
+
+#if defined(__BLOCKS__) /* nvcc does not support closures */
+
+#undef __BLOCKS__
+
+#endif /* __BLOCKS__ */
+
+#elif defined(__ANDROID__)
+
+#define _CRTIMP
+#define _ACRTIMP
+#define __THROW
+
+#elif defined(__QNX__)
+
+#define _CRTIMP
+#define _ACRTIMP
+#define __THROW
+
+#elif defined(__HORIZON__)
+
+#define _CRTIMP
+#define _ACRTIMP
+#define __THROW
+
+#elif defined(__GNUC__)
+/* Remaining GNU-compatible hosts: __THROW is taken from the C library header instead of being defined here */
+#define _CRTIMP
+#define _ACRTIMP
+
+#include <features.h> /* for __THROW */
+
+#elif defined(_WIN32)
+
+#if _MSC_VER >= 1500
+/* Force _USE_DECLSPECS_FOR_SAL to 1, replacing any earlier definition */
+#undef _USE_DECLSPECS_FOR_SAL
+#define _USE_DECLSPECS_FOR_SAL \
+        1
+
+#endif /* _MSC_VER >= 1500 */
+
+#if !defined(_CRT_NONSTDC_NO_WARNINGS)
+
+#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
+
+#endif /* !_CRT_NONSTDC_NO_WARNINGS */
+
+#if !defined(_CRT_SECURE_NO_WARNINGS)
+
+#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
+
+#endif /* !_CRT_SECURE_NO_WARNINGS */
+
+#if !defined(NOMINMAX)
+
+#define NOMINMAX /* min and max are part of cuda runtime */
+
+#endif /* !NOMINMAX */
+/* On Windows _CRTIMP / _ACRTIMP come from the CRT headers rather than being defined empty */
+#include <crtdefs.h> /* for _CRTIMP */
+#if _MSC_VER >= 1900
+#include <corecrt.h> /* for _ACRTIMP */
+#endif /* _MSC_VER >= 1900 */
+
+#define __THROW
+
+#endif /* __APPLE__ / __ANDROID__ / __QNX__ / __HORIZON__ / __GNUC__ / _WIN32 */
+
+#endif /* __CUDACC_RTC__ */
+
+
+#if defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER)))
+/* C++ device passes under PGI, NVRTC or MSVC: replace the va_* macros with the alternatives below */
+#if __CUDACC_RTC__
+typedef char *va_list;
+#else /* !__CUDACC_RTC__ */
+#include <cstdarg>
+#endif /* __CUDACC_RTC__ */
+
+/* Drop whatever definitions are currently active before redefining them */
+#undef va_start
+#undef va_end
+#undef va_arg
+
+#ifdef __PGIC__
+
+#undef __builtin_va_end
+/* PGI: map onto compiler builtins (the __builtin_alt_* names are not declared in this file) */
+#define va_start(v,l) __builtin_alt_va_start(v,l)
+#define va_end(v) __builtin_va_end(v)
+#define va_arg(v,l) __builtin_alt_va_arg(v,l)
+
+#if (__cplusplus >= 201103L)
+#undef va_copy
+#define va_copy(d,s)  __builtin_va_copy(d,s)
+#endif
+
+#else /* !__PGIC__ */
+
+/* Otherwise: forward to __cu_va_* helpers, passing the va_list by address (helpers are declared elsewhere) */
+#define va_start(ap, x) (__cu_va_start(&ap, x))
+#define va_end(ap) (__cu_va_end(&ap))
+#define va_arg(ap, t)  (*((t *)__cu_va_arg(&ap, (t *)0)))
+
+#if (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L))
+#undef va_copy
+#define va_copy(apd, aps) (__cu_va_copy(&(apd), &(aps)))
+#endif /* (_MSC_VER >= 1800)  || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L)) */
+#endif /* __PGIC__ */
+
+#endif /* defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER))) */
+
+
+
+#endif /* __CUDACC__ */
+
+#endif /* !__HOST_CONFIG_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__) /* set only when this header defined the marker itself */
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_defines.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_defines.h
new file mode 100644
index 0000000000000000000000000000000000000000..86aeb1e07620cf6a6c91ed8ca8a0bfc720c4268e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_defines.h
@@ -0,0 +1,280 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* internal-header marker not set by the includer: warn, then set it locally */
+#if defined(_MSC_VER)
+#pragma message("crt/host_defines.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/host_defines.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__ /* tells the end of this file to undo the define above */
+#endif
+
+#if !defined(__HOST_DEFINES_H__)
+#define __HOST_DEFINES_H__
+
+#if defined(__CUDACC__) && !defined(__CUDACC_RTC__) && !defined(__CUDADEVRT_INTERNAL__) && !defined(_ALLOW_UNSUPPORTED_LIBCPP)
+#include <ctype.h> /* presumably pulled in so that _LIBCPP_VERSION is visible when libc++ is the standard library -- confirm */
+#if ((defined(_MSC_VER ) && (defined(_M_X64) || defined(_M_AMD64))) ||\
+     (defined(__x86_64__) || defined(__amd64__))) && defined(_LIBCPP_VERSION) && !(defined(__HORIZON__) || defined(__ANDROID__) || defined(__QNX__))
+#error "libc++ is not supported on x86 system"
+#endif /* x86-64 target with _LIBCPP_VERSION defined, outside __HORIZON__ / __ANDROID__ / __QNX__ */
+#endif /* __CUDACC__ && !__CUDACC_RTC__ && !__CUDADEVRT_INTERNAL__ && !_ALLOW_UNSUPPORTED_LIBCPP */
+
+/* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */
+#if defined(__GNUC__) || (defined(__PGIC__) && defined(__linux__)) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__)
+/* GNU-style branch: the CUDA decoration macros map onto __attribute__((...)) */
+#if defined(__CUDACC_RTC__)
+#define __volatile__ volatile
+#endif /* __CUDACC_RTC__ */
+
+#define __no_return__ \
+        __attribute__((noreturn))
+
+#if defined(__CUDACC__) || defined(__CUDA_ARCH__) || defined(__CUDA_LIBDEVICE__)
+/* gcc allows users to define attributes with underscores,
+   e.g., __attribute__((__noinline__)).
+   Consider a non-CUDA source file (e.g. .cpp) that has the
+   above attribute specification, and includes this header file. In that case,
+   defining __noinline__ as below would cause a gcc compilation error.
+   Hence, only define __noinline__ when the code is being processed
+   by a CUDA compiler component.
+*/
+#define __noinline__ \
+        __attribute__((noinline))
+#endif /* __CUDACC__  || __CUDA_ARCH__ || __CUDA_LIBDEVICE__ */
+/* __forceinline__ is undefined first so that any earlier definition is replaced */
+#undef __forceinline__
+#define __forceinline__ \
+        __inline__ __attribute__((always_inline))
+#define __inline_hint__ \
+        __attribute__((nv_inline_hint))
+#define __align__(n) \
+        __attribute__((aligned(n)))
+#define __maxnreg__(a) \
+        __attribute__((maxnreg(a)))
+#define __thread__ \
+        __thread
+#define __import__ /* empty in this branch */
+#define __export__ /* empty in this branch */
+#define __cdecl /* empty in this branch */
+#define __annotate__(a) \
+        __attribute__((a))
+#define __location__(a) \
+        __annotate__(a)
+#define CUDARTAPI /* empty in this branch */
+#define CUDARTAPI_CDECL /* empty in this branch */
+
+#elif defined(_MSC_VER)
+/* MSVC branch: the same macro names map onto __declspec(...) and MSVC keywords */
+#if _MSC_VER >= 1400
+
+#define __restrict__ \
+        __restrict
+
+#else /* _MSC_VER >= 1400 */
+
+#define __restrict__
+
+#endif /* _MSC_VER >= 1400 */
+
+#define __inline__ \
+        __inline
+#define __no_return__ \
+        __declspec(noreturn)
+#define __noinline__ \
+        __declspec(noinline)
+#define __forceinline__ \
+        __forceinline
+#define __inline_hint__ \
+        __declspec(nv_inline_hint)
+#define __align__(n) \
+        __declspec(align(n))
+#define __maxnreg__(n) \
+        __declspec(maxnreg(n))
+#define __thread__ \
+        __declspec(thread)
+#define __import__ \
+        __declspec(dllimport)
+#define __export__ \
+        __declspec(dllexport)
+#define __annotate__(a) \
+        __declspec(a)
+#define __location__(a) \
+        __annotate__(__##a##__) /* token-pastes double underscores around the name, e.g. host becomes __host__ */
+#define CUDARTAPI \
+        __stdcall
+#define CUDARTAPI_CDECL \
+        __cdecl
+
+#else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
+/* Unknown compiler: __inline__ becomes empty, and the user must have supplied __align__ and CUDARTAPI already */
+#define __inline__
+
+#if !defined(__align__)
+
+#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! ---
+
+#endif /* !__align__ */
+
+#if !defined(CUDARTAPI)
+
+#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! ---
+
+#endif /* !CUDARTAPI */
+
+#endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
+
+#if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \
+    (defined(_MSC_VER) && _MSC_VER < 1900) || \
+    (!defined(__GNUC__) && !defined(_MSC_VER))
+/* Older GNU (< 4.3, non-clang), MSVC before 1900, or any other compiler: expands to `static` */
+#define __specialization_static \
+        static
+
+#else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
+         (_MSC_VER && _MSC_VER < 1900) ||
+         (!__GNUC__ && !_MSC_VER) */
+/* Newer GNU-compatible compilers and MSVC >= 1900: expands to nothing */
+#define __specialization_static
+
+#endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
+         (_MSC_VER && _MSC_VER < 1900) ||
+         (!__GNUC__ && !_MSC_VER) */
+
+#if !defined(__CUDACC__) && !defined(__CUDA_LIBDEVICE__)
+/* Outside a CUDA compilation: __annotate__ is redefined to nothing; __launch_bounds__ is not defined here */
+#undef __annotate__
+#define __annotate__(a)
+
+#else /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
+/* CUDA compilation: __launch_bounds__(...) forwards its arguments through __annotate__ */
+#define __launch_bounds__(...) \
+        __annotate__(launch_bounds(__VA_ARGS__))
+
+#endif /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
+
+#if defined(__CUDACC__) || defined(__CUDA_LIBDEVICE__) || \
+    defined(__GNUC__) || defined(_WIN64)
+/* These configurations forward __builtin_align__(a) to __align__(a) */
+#define __builtin_align__(a) \
+        __align__(a)
+
+#else /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
+/* Everything else: __builtin_align__(a) expands to nothing */
+#define __builtin_align__(a)
+
+#endif /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__  || _WIN64 */
+
+#if defined(__CUDACC__) || !defined(__grid_constant__) /* shared pattern: under __CUDACC__ always define; otherwise only when not already defined */
+#define __grid_constant__ \
+        __location__(grid_constant)
+#endif /* defined(__CUDACC__) || !defined(__grid_constant__) */
+/* Each qualifier below expands through __location__(name) */
+#if defined(__CUDACC__) || !defined(__host__)
+#define __host__ \
+        __location__(host)
+#endif /* defined(__CUDACC__) || !defined(__host__) */
+#if defined(__CUDACC__) || !defined(__device__)
+#define __device__ \
+        __location__(device)
+#endif /* defined(__CUDACC__) || !defined(__device__) */
+#if defined(__CUDACC__) || !defined(__global__)
+#define __global__ \
+        __location__(global)
+#endif /* defined(__CUDACC__) || !defined(__global__) */
+#if defined(__CUDACC__) || !defined(__shared__)
+#define __shared__ \
+        __location__(shared)
+#endif /* defined(__CUDACC__) || !defined(__shared__) */
+#if defined(__CUDACC__) || !defined(__constant__)
+#define __constant__ \
+        __location__(constant)
+#endif /* defined(__CUDACC__) || !defined(__constant__) */
+#if defined(__CUDACC__) || !defined(__managed__)
+#define __managed__ \
+        __location__(managed)
+#endif /* defined(__CUDACC__) || !defined(__managed__) */
+        
+#if !defined(__CUDACC__) /* without __CUDACC__ the builtin markers expand to nothing */
+#define __device_builtin__
+#define __device_builtin_texture_type__
+#define __device_builtin_surface_type__
+#define __cudart_builtin__
+#else /* defined(__CUDACC__) */
+#define __device_builtin__ \
+        __location__(device_builtin) /* with __CUDACC__ each marker expands through __location__ */
+#define __device_builtin_texture_type__ \
+        __location__(device_builtin_texture_type)
+#define __device_builtin_surface_type__ \
+        __location__(device_builtin_surface_type)
+#define __cudart_builtin__ \
+        __location__(cudart_builtin)
+#endif /* !defined(__CUDACC__) */
+
+#if defined(__CUDACC__) || !defined(__cluster_dims__) /* under __CUDACC__ always define; otherwise only when not already defined */
+#if defined(_MSC_VER)
+#define __cluster_dims__(...) \
+        __declspec(__cluster_dims__(__VA_ARGS__))
+/* Non-MSVC compilers use the GNU attribute spelling with the same arguments */
+#else  /* !defined(_MSC_VER) */
+#define __cluster_dims__(...) \
+        __attribute__((cluster_dims(__VA_ARGS__)))
+#endif  /* defined(_MSC_VER) */
+#endif  /* defined(__CUDACC__) || !defined(__cluster_dims__) */
+/* Token-pastes the feature name onto the __CUDA_ARCH_FEAT_ prefix, e.g. FOO becomes __CUDA_ARCH_FEAT_FOO */
+#define __CUDA_ARCH_HAS_FEATURE__(_FEAT) __CUDA_ARCH_FEAT_##_FEAT
+
+#endif /* !__HOST_DEFINES_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__) /* set only when this header defined the marker itself */
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_runtime.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_runtime.h
new file mode 100644
index 0000000000000000000000000000000000000000..22e3a1bea875ddb2a15075f6e0ecb10b7ce1a6a7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_runtime.h
@@ -0,0 +1,306 @@
+/*
+ * NVIDIA_COPYRIGHT_BEGIN
+ *
+ * Copyright (c) 2008-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ * NVIDIA_COPYRIGHT_END
+ */
+
+/* Internal-header guard.  When this header is included without
+ * __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ already defined, emit a
+ * diagnostic (#pragma message on MSVC, #warning elsewhere), then define that
+ * macro together with a per-header marker so the cleanup at the end of this
+ * file can undefine both again.  The diagnostic names this header
+ * (crt/host_runtime.h) so the user is pointed at the include that triggered
+ * it. */
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/host_runtime.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/host_runtime.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__
+#endif
+
+#if !defined(__CUDA_INTERNAL_COMPILATION__)
+
+/* First inclusion: mark internal compilation as active and define the helper
+ * macros used to name variables and functions.
+ *   - __text__ and __surf__ expand to nothing.
+ *   - __name__shadow_var, __device__text_var and __device__shadow_var
+ *     stringize their first argument (c).
+ *   - __name__text_var stringizes its second argument (cpp).
+ *   - __host__shadow_var and __text_var expand to their second argument (cpp).
+ *   - __device_fun and __device_var stringize their single argument.
+ * NOTE(review): the registration macros in the other branch of this header
+ * paste __host / __device / __name onto their `var` argument, which presumably
+ * yields the two-argument macros above -- confirm against generated code. */
+#define __CUDA_INTERNAL_COMPILATION__
+#define __text__
+#define __surf__
+#define __name__shadow_var(c, cpp) \
+        #c
+#define __name__text_var(c, cpp) \
+        #cpp
+#define __host__shadow_var(c, cpp) \
+        cpp
+#define __text_var(c, cpp) \
+        cpp
+#define __device_fun(fun) \
+        #fun
+#define __device_var(var) \
+        #var
+#define __device__text_var(c, cpp) \
+        #c
+#define __device__shadow_var(c, cpp) \
+        #c
+
+/* __pad__(f) keeps its argument only when _WIN32 is defined and _WIN64 is
+ * not; in every other configuration it expands to nothing. */
+#if defined(_WIN32) && !defined(_WIN64)
+
+#define __pad__(f) \
+        f
+
+#else /* _WIN32 && !_WIN64 */
+
+#define __pad__(f)
+
+#endif /* _WIN32 && !_WIN64 */
+
+#include "builtin_types.h"
+#include "storage_class.h"
+
+#else /* !__CUDA_INTERNAL_COMPILATION__ */
+
+/* Return a pointer to `val`.
+ *
+ * The object is first viewed as a `const volatile char`, the cv-qualifiers are
+ * cast away, and the address is taken from that char reference.  Because the
+ * address-of operator is applied to a char rather than to a T, a user-defined
+ * operator& on T is never invoked. */
+template <typename T>
+static inline T *__cudaAddressOf(T &val)
+{
+    const volatile char &cv_bytes = reinterpret_cast<const volatile char &>(val);
+    char &bytes = const_cast<char &>(cv_bytes);
+    void *raw_address = (void *)(&bytes);
+    return (T *)raw_address;
+}
+
+/* Expands to a statement sequence with no trailing semicolon:
+ *   1. store the result of __cudaRegisterFatBinary(&__fatDeviceText) in
+ *      __cudaFatCubinHandle;
+ *   2. cast X to void (*)(void **), call it with that handle, then call
+ *      __cudaRegisterFatBinaryEnd on the handle;
+ *   3. register __cudaUnregisterBinaryUtil with atexit. */
+#define __cudaRegisterBinary(X)                                                   \
+        __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)&__fatDeviceText); \
+        { void (*callback_fp)(void **) =  (void (*)(void **))(X); (*callback_fp)(__cudaFatCubinHandle); __cudaRegisterFatBinaryEnd(__cudaFatCubinHandle); }\
+        atexit(__cudaUnregisterBinaryUtil)
+        
+/* Variable registration wrappers.  Both paste __host, __device and __name
+ * onto `var` to build the host address, device address and device name
+ * arguments, and pass the remaining arguments through unchanged.  The managed
+ * variant passes the host side as void ** (address of the host pointer)
+ * rather than char *. */
+#define __cudaRegisterVariable(handle, var, ext, size, constant, global) \
+        __cudaRegisterVar(handle, (char*)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
+#define __cudaRegisterManagedVariable(handle, var, ext, size, constant, global) \
+        __cudaRegisterManagedVar(handle, (void **)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
+
+/* Texture, surface and kernel-entry registration wrappers.
+ *   - The texture and surface forms pass the address of the host object and
+ *     build the device address and name by pasting __device / __name onto
+ *     the argument.
+ *   - __cudaRegisterEntry ignores its thread_limit parameter: it always
+ *     passes -1 for the limit and null pointers for the trailing uint3 / dim3
+ *     / int pointer arguments.  The device-side name is the stringized `fun`,
+ *     passed twice (as deviceFun and deviceName). */
+#define __cudaRegisterGlobalTexture(handle, tex, dim, norm, ext) \
+        __cudaRegisterTexture(handle, (const struct textureReference*)&tex, (const void**)(void*)__device##tex, __name##tex, dim, norm, ext)
+#define __cudaRegisterGlobalSurface(handle, surf, dim, ext) \
+        __cudaRegisterSurface(handle, (const struct surfaceReference*)&surf, (const void**)(void*)__device##surf, __name##surf, dim, ext)
+#define __cudaRegisterEntry(handle, funptr, fun, thread_limit) \
+        __cudaRegisterFunction(handle, (const char*)funptr, (char*)__device_fun(fun), #fun, -1, (uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0)
+
+/* Declaration only (C linkage).  Takes four out-pointers and returns a
+ * cudaError_t; __cudaLaunch below compares the result against cudaSuccess and
+ * passes the address of a cudaStream_t for the void *stream parameter. */
+extern "C" cudaError_t CUDARTAPI __cudaPopCallConfiguration(
+  dim3         *gridDim,
+  dim3         *blockDim,
+  size_t       *sharedMem,
+  void         *stream
+);
+
+/* Declares the argument-pointer array __args_arr[size] and the running index
+ * __args_idx (initialised to 0) that the setup macros below fill in. */
+#define __cudaLaunchPrologue(size) \
+        void * __args_arr[size]; \
+        int __args_idx = 0
+        
+/* Stores the address of `arg`, obtained through __cudaAddressOf, in the next
+ * slot and advances the index.  The `offset` parameter is not used. */
+#define __cudaSetupArg(arg, offset) \
+        __args_arr[__args_idx] = (void *)__cudaAddressOf(arg); ++__args_idx
+          
+/* Same as __cudaSetupArg, but takes the address with the plain & operator.
+ * The `offset` parameter is not used. */
+#define __cudaSetupArgSimple(arg, offset) \
+        __args_arr[__args_idx] = (void *)(char *)&arg; ++__args_idx
+        
+/* Expands to __attribute__((unused)) when __GNUC__ is defined and to nothing
+ * otherwise; applied to the helper locals declared inside __cudaLaunch. */
+#if defined(__GNUC__)
+#define __NV_ATTR_UNUSED_FOR_LAUNCH __attribute__((unused))
+#else  /* !__GNUC__ */
+#define __NV_ATTR_UNUSED_FOR_LAUNCH
+#endif  /* __GNUC__ */
+
+/* __cudaLaunch(fun): launch `fun` with the arguments collected in __args_arr.
+ *
+ * Both variants
+ *   - assign `fun` to a volatile static local (__f);
+ *   - fetch grid, block, shared-memory and stream values through
+ *     __cudaPopCallConfiguration and execute a bare `return;` if that call
+ *     does not report cudaSuccess, so the macro can only be expanded inside a
+ *     function returning void;
+ *   - discard the result of the launch call.
+ *
+ * With __NV_LEGACY_LAUNCH the launch goes through cudaLaunchKernel(fun, ...).
+ * Otherwise a static cudaKernel_t handle is filled in by __cudaGetKernel
+ * (called from the initializer of a static local) and the launch goes through
+ * __cudaLaunchKernel_helper(__handle, ...). */
+#ifdef __NV_LEGACY_LAUNCH
+/* the use of __args_idx in the expression below avoids host compiler warning about it being an
+   unused variable when the launch has no arguments */
+#define __cudaLaunch(fun) \
+        { volatile static char *__f __NV_ATTR_UNUSED_FOR_LAUNCH;  __f = fun; \
+          dim3 __gridDim, __blockDim;\
+          size_t __sharedMem; \
+          cudaStream_t __stream; \
+          if (__cudaPopCallConfiguration(&__gridDim, &__blockDim, &__sharedMem, &__stream) != cudaSuccess) \
+            return; \
+          if (__args_idx == 0) {\
+            (void)cudaLaunchKernel(fun, __gridDim, __blockDim, &__args_arr[__args_idx], __sharedMem, __stream);\
+          } else { \
+            (void)cudaLaunchKernel(fun, __gridDim, __blockDim, &__args_arr[0], __sharedMem, __stream);\
+          }\
+        }
+#else  /* !__NV_LEGACY_LAUNCH */
+/* Handle-based variant; the two branches on __args_idx reference the index
+ * for the same reason as in the legacy variant above. */
+#define __cudaLaunch(fun) \
+        { volatile static char *__f __NV_ATTR_UNUSED_FOR_LAUNCH;  __f = fun; \
+          static cudaKernel_t __handle = 0; \
+          volatile static bool __tmp __NV_ATTR_UNUSED_FOR_LAUNCH = (__cudaGetKernel(&__handle, (const void *)fun) == cudaSuccess); \
+          dim3 __gridDim, __blockDim;\
+          size_t __sharedMem; \
+          cudaStream_t __stream; \
+          if (__cudaPopCallConfiguration(&__gridDim, &__blockDim, &__sharedMem, &__stream) != cudaSuccess) \
+            return; \
+          if (__args_idx == 0) {\
+            (void)__cudaLaunchKernel_helper(__handle, __gridDim, __blockDim, &__args_arr[__args_idx], __sharedMem, __stream);\
+          } else { \
+            (void)__cudaLaunchKernel_helper(__handle, __gridDim, __blockDim, &__args_arr[0], __sharedMem, __stream);\
+          }\
+        }
+#endif  /* __NV_LEGACY_LAUNCH */
+
+/* __nv_dummy_param_ref(param) expands to a brace-enclosed block that assigns
+ * `param` to a volatile static local (__ref).  Under __GNUC__ the local also
+ * carries __attribute__((unused)). */
+#if defined(__GNUC__)
+#define __nv_dummy_param_ref(param) \
+        { volatile static void **__ref __attribute__((unused)); __ref = (volatile void **)param; }
+#else /* __GNUC__ */
+#define __nv_dummy_param_ref(param) \
+        { volatile static void **__ref; __ref = (volatile void **)param; }
+#endif /* __GNUC__ */
+
+/* Function form of the macro above: the macro expansion supplies the whole
+ * function body, which is why no braces or semicolon follow it here. */
+static void ____nv_dummy_param_ref(void *param) __nv_dummy_param_ref(param)
+
+/* Builds the identifier __cudaRegisterLinkedBinary<X>.  The two-level
+ * definition lets a macro argument such as __NV_MODULE_ID be expanded before
+ * it is pasted. */
+#define __REGISTERFUNCNAME_CORE(X) __cudaRegisterLinkedBinary##X
+#define __REGISTERFUNCNAME(X) __REGISTERFUNCNAME_CORE(X)
+
+/* Declares (C linkage) the per-module registration function whose name is
+ * derived from __NV_MODULE_ID. */
+extern "C" {
+void __REGISTERFUNCNAME( __NV_MODULE_ID ) ( void (*)(void **), void *, void *, void (*)(void *));
+}
+
+/* Two-level stringizing: __TO_STRING(X) expands X first and then turns the
+ * result into a string literal. */
+#define __TO_STRING_CORE(X) #X
+#define __TO_STRING(X) __TO_STRING_CORE(X)
+
+/* Module identification and linked-binary registration.
+ *
+ * __module_id_str holds the stringized __NV_MODULE_ID and is placed in a
+ * section named "__nv_module_id" using the syntax of the current platform
+ * (data_seg / __declspec(allocate) on _WIN32, a "__NV_CUDA,__nv_module_id"
+ * section attribute on __APPLE__, a plain section attribute otherwise).
+ *
+ * __FATIDNAME(X) builds the identifier __fatbinwrap<X>; any earlier
+ * definition of it is removed first.
+ *
+ * ____cudaRegisterLinkedBinary(X) calls the per-module registration function
+ * with X (cast to void (*)(void **)), the address of the __fatbinwrap object,
+ * the address of __module_id_str and the address of ____nv_dummy_param_ref. */
+extern "C" {
+#if defined(_WIN32)
+#pragma data_seg("__nv_module_id")
+  static const __declspec(allocate("__nv_module_id")) unsigned char __module_id_str[] = __TO_STRING(__NV_MODULE_ID);
+#pragma data_seg()
+#elif defined(__APPLE__)
+  static const unsigned char __module_id_str[] __attribute__((section ("__NV_CUDA,__nv_module_id"))) = __TO_STRING(__NV_MODULE_ID);
+#else
+  static const unsigned char __module_id_str[] __attribute__((section ("__nv_module_id"))) = __TO_STRING(__NV_MODULE_ID);
+#endif
+
+#undef __FATIDNAME_CORE
+#undef __FATIDNAME
+#define __FATIDNAME_CORE(X) __fatbinwrap##X
+#define __FATIDNAME(X) __FATIDNAME_CORE(X)
+
+#define  ____cudaRegisterLinkedBinary(X) \
+{ __REGISTERFUNCNAME(__NV_MODULE_ID) (( void (*)(void **))(X), (void *)&__FATIDNAME(__NV_MODULE_ID), (void *)&__module_id_str, (void (*)(void *))&____nv_dummy_param_ref); }
+
+}
+
+/* Prototypes (C linkage) of the registration entry points used by the macros
+ * earlier in this header.  This section only declares them. */
+extern "C" {
+/* Takes the fat binary and returns the handle that every other registration
+ * call receives as fatCubinHandle. */
+extern void** CUDARTAPI __cudaRegisterFatBinary(
+  void *fatCubin
+);
+
+/* Called by __cudaRegisterBinary after the registration callback has run. */
+extern void CUDARTAPI __cudaRegisterFatBinaryEnd(
+  void **fatCubinHandle
+);
+
+/* Called from __cudaUnregisterBinaryUtil, which is registered with atexit. */
+extern void CUDARTAPI __cudaUnregisterFatBinary(
+  void **fatCubinHandle
+);
+
+/* Target of the __cudaRegisterVariable macro. */
+extern void CUDARTAPI __cudaRegisterVar(
+        void **fatCubinHandle,
+        char  *hostVar,
+        char  *deviceAddress,
+  const char  *deviceName,
+        int    ext,
+        size_t size,
+        int    constant,
+        int    global
+);
+
+/* Target of the __cudaRegisterManagedVariable macro; unlike
+ * __cudaRegisterVar it receives the address of the host pointer. */
+extern void CUDARTAPI __cudaRegisterManagedVar(
+        void **fatCubinHandle,
+        void **hostVarPtrAddress,
+        char  *deviceAddress,
+  const char  *deviceName,
+        int    ext,
+        size_t size,
+        int    constant,
+        int    global
+);
+
+/* Wrapped by __nv_init_managed_rt_with_module later in this header. */
+extern char CUDARTAPI __cudaInitModule(
+        void **fatCubinHandle
+);
+
+/* Target of the __cudaRegisterGlobalTexture macro. */
+extern void CUDARTAPI __cudaRegisterTexture(
+        void                    **fatCubinHandle,
+  const struct textureReference  *hostVar,
+  const void                    **deviceAddress,
+  const char                     *deviceName,
+        int                       dim,       
+        int                       norm,      
+        int                        ext        
+);
+
+/* Target of the __cudaRegisterGlobalSurface macro. */
+extern void CUDARTAPI __cudaRegisterSurface(
+        void                    **fatCubinHandle,
+  const struct surfaceReference  *hostVar,
+  const void                    **deviceAddress,
+  const char                     *deviceName,
+        int                       dim,       
+        int                       ext        
+);
+
+/* Target of the __cudaRegisterEntry macro, which always passes -1 for
+ * thread_limit and null pointers for tid, bid, bDim, gDim and wSize. */
+extern void CUDARTAPI __cudaRegisterFunction(
+        void   **fatCubinHandle,
+  const char    *hostFun,
+        char    *deviceFun,
+  const char    *deviceName,
+        int      thread_limit,
+        uint3   *tid,
+        uint3   *bid,
+        dim3    *bDim,
+        dim3    *gDim,
+        int     *wSize
+);
+
+/* Platform-specific declaration of atexit, needed by __cudaRegisterBinary.
+ * On __HORIZON__ atexit(p) is defined as an empty macro instead, so the
+ * registration expands to nothing there. */
+#if defined(__APPLE__)
+extern "C" int atexit(void (*)(void));
+
+#elif  defined(__GNUC__) && !defined(__ANDROID__) && !defined(__HORIZON__)
+extern int atexit(void(*)(void)) throw();
+
+#elif defined(__HORIZON__)
+
+// __TEMP_WAR__ 200132570 HOS : Disable atexit call until it works
+#define atexit(p)
+
+#else /* __GNUC__ && !__ANDROID__ */
+extern int __cdecl atexit(void(__cdecl *)(void));
+#endif
+
+}
+
+/* Per-translation-unit handle: assigned by __cudaRegisterBinary and read by
+ * __cudaUnregisterBinaryUtil. */
+static void **__cudaFatCubinHandle;
+
+/* Exit handler installed by __cudaRegisterBinary through atexit.  It passes
+ * the address of __cudaFatCubinHandle to ____nv_dummy_param_ref and then
+ * unregisters the fat binary identified by that handle. */
+static void __cdecl __cudaUnregisterBinaryUtil(void)
+{
+  ____nv_dummy_param_ref((void *)&__cudaFatCubinHandle);
+  __cudaUnregisterFatBinary(__cudaFatCubinHandle);
+}
+
+/* Forward `handle` to __cudaInitModule and hand its char result back to the
+ * caller unchanged. */
+static char __nv_init_managed_rt_with_module(void **handle)
+{
+  const char init_result = __cudaInitModule(handle);
+  return init_result;
+}
+
+#include "common_functions.h"
+
+/* #pragma pack() with no argument.  NOTE(review): presumably restores default
+ * packing after the headers included above -- confirm. */
+#pragma pack()
+
+/* On _WIN32, disable compiler warning 4099, and additionally warning 4408
+ * when _WIN64 is not defined.  No matching re-enable appears in this header.
+ * NOTE(review): see the MSVC warning reference for the meaning of these
+ * numbers. */
+#if defined(_WIN32)
+
+#pragma warning(disable: 4099)
+
+#if !defined(_WIN64)
+
+#pragma warning(disable: 4408)
+
+#endif /* !_WIN64 */
+
+#endif /* _WIN32 */
+
+#endif /* !__CUDA_INTERNAL_COMPILATION__ */
+
+/* Cleanup for the guard at the top of this header: if that guard defined
+ * __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ (signalled by the per-header
+ * marker), undefine both macros again. */
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/math_functions.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/math_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..1896e15d16e5f629655746faca4f05baf9e5a83c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/math_functions.h
@@ -0,0 +1,12208 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/* Internal-header guard.  When this header is included without
+ * __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ already defined, emit a
+ * diagnostic (#pragma message on MSVC, #warning elsewhere), then define that
+ * macro together with a per-header marker.  NOTE(review): the marker is
+ * presumably consumed by a cleanup block at the end of this header, outside
+ * this view -- confirm. */
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/math_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/math_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H__
+#endif
+
+#if !defined(__MATH_FUNCTIONS_H__)
+#define __MATH_FUNCTIONS_H__
+
+/* On QNX with GCC 5 or newer under __CUDACC__, include <__config> when that
+ * header exists.  NOTE(review): presumably this makes _LIBCPP_VERSION
+ * available to the QNX checks later in this header -- confirm. */
+#if defined(__QNX__) && (__GNUC__ >= 5) && defined(__CUDACC__)
+#if __has_include(<__config>)
+#include <__config>
+#endif
+#endif
+
+/**
+ * \defgroup CUDA_MATH Mathematical Functions
+ *
+ * CUDA mathematical functions are always available in device code.
+ *
+ * Host implementations of the common mathematical functions are mapped
+ * in a platform-specific way to standard math library functions, provided
+ * by the host compiler and respective host libm where available.
+ * Some functions, not available with the host compilers, are implemented
+ * in crt/math_functions.hpp header file.
+ * For example, see ::erfinv(). Other, less common functions,
+ * like ::rhypot(), ::cyl_bessel_i0() are only available in device code.
+ *
+ * Note that many floating-point and integer function names are
+ * overloaded for different argument types. For example, the ::log()
+ * function has the following prototypes:
+ * \code
+ * double log(double x);
+ * float log(float x);
+ * float logf(float x);
+ * \endcode
+ *
+ * Note also that due to implementation constraints, certain math functions
+ * from std:: namespace may be callable in device code even via explicitly
+ * qualified std:: names. However, such use is discouraged, since this
+ * capability is unsupported, unverified, undocumented, not portable, and
+ * may change without notice.
+ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "host_defines.h"
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+// EXCLUDE_FROM_RTC is defined with an empty expansion.
+#define EXCLUDE_FROM_RTC
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+extern "C"
+{
+
+/* Define math function DOXYGEN toplevel groups, functions will
+   be added to these groups later.
+*/
+/**
+ * \defgroup CUDA_MATH_SINGLE Single Precision Mathematical Functions
+ * This section describes single precision mathematical functions.
+ * To use these functions you do not need to include any additional 
+ * header files in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_DOUBLE Double Precision Mathematical Functions
+ * This section describes double precision mathematical functions.
+ * To use these functions you do not need to include any additional 
+ * header files in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_INT Integer Mathematical Functions
+ * This section describes integer mathematical functions.
+ * To use these functions you do not need to include any additional
+ * header files in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_SINGLE Single Precision Intrinsics
+ * This section describes single precision intrinsic functions that are
+ * only supported in device code.
+ * To use these functions you do not need to include any additional 
+ * header files in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_DOUBLE Double Precision Intrinsics
+ * This section describes double precision intrinsic functions that are
+ * only supported in device code.
+ * To use these functions you do not need to include any additional 
+ * header files in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_INT Integer Intrinsics
+ * This section describes integer intrinsic functions that are
+ * only supported in device code.
+ * To use these functions you do not need to include any additional 
+ * header files in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_CAST Type Casting Intrinsics
+ * This section describes type casting intrinsic functions that are
+ * only supported in device code.
+ * To use these functions you do not need to include any additional 
+ * header files in your program.
+ */
+
+/**
+ *
+ * \defgroup CUDA_MATH_INTRINSIC_SIMD SIMD Intrinsics
+ * This section describes SIMD intrinsic functions that are
+ * only supported in device code.
+ * To use these functions you do not need to include any additional 
+ * header files in your program.
+ */
+
+
+/**
+ * @}
+ */
+/* __DEVICE_FUNCTIONS_DECL__ marks the declarations below as __host__
+ * __device__. */
+#define __DEVICE_FUNCTIONS_DECL__ __host__ __device__
+/* __CUDA_MATH_CRTIMP is empty unless _MSC_VER is defined; on MSVC it expands
+ * to _CRTIMP for _MSC_VER below 1900 and to _ACRTIMP from 1900 onwards. */
+#if !defined(_MSC_VER)
+#define __CUDA_MATH_CRTIMP
+#else
+#if _MSC_VER < 1900
+#define __CUDA_MATH_CRTIMP _CRTIMP
+#else
+#define __CUDA_MATH_CRTIMP _ACRTIMP
+#endif
+#endif
+
+/* Integer absolute-value declarations.
+ * On 32-bit Android with API level 20 or lower (not __aarch64__) the three
+ * functions are declared static; everywhere else they are declared extern,
+ * and on QNX without libc++ (_LIBCPP_VERSION undefined) the extern
+ * declarations are wrapped in namespace std. */
+#if defined(__ANDROID__) && (__ANDROID_API__ <= 20) && !defined(__aarch64__)
+static __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ int                    abs(int);
+static __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ long int               labs(long int);
+static __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ long long int          llabs(long long int);
+#else /* __ANDROID__ */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the absolute value of the input \p int argument.
+ *
+ * Calculate the absolute value of the input argument \p a.
+ *
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ int            __cdecl abs(int a) __THROW;
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the absolute value of the input \p long \p int argument.
+ *
+ * Calculate the absolute value of the input argument \p a.
+ *
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ long int       __cdecl labs(long int a) __THROW;
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the absolute value of the input \p long \p long \p int argument.
+ *
+ * Calculate the absolute value of the input argument \p a.
+ *
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ long long int          llabs(long long int a) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+}
+#endif
+#endif /* __ANDROID__ */
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+/* put all math functions in std */
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the absolute value of the input argument.
+ *
+ * Calculate the absolute value of the input argument \p x.
+ *
+ * \param x Value whose absolute value is calculated.
+ *
+ * \return
+ * Returns the absolute value of the input argument.
+ * - fabs(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - fabs(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl fabs(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the absolute value of its argument.
+ *
+ * Calculate the absolute value of the input argument \p x.
+ *
+ * \param x Value whose absolute value is calculated.
+ *
+ * \return
+ * Returns the absolute value of its argument.
+ * - fabsf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - fabsf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ * - fabsf(NaN) returns an unspecified NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fabsf(float x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ *
+ * \param a First value to compare.
+ * \param b Second value to compare.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    min(const int a, const int b);
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ *
+ * \param a First value to compare.
+ * \param b Second value to compare.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           umin(const unsigned int a, const unsigned int b);
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ *
+ * \param a First value to compare.
+ * \param b Second value to compare.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llmin(const long long int a, const long long int b);
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ *
+ * \param a First value to compare.
+ * \param b Second value to compare.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int ullmin(const unsigned long long int a, const unsigned long long int b);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Determine the minimum numeric value of the arguments.
+ *
+ * Determines the minimum numeric value of the arguments \p x and \p y. Treats NaN 
+ * arguments as missing data. If one argument is a NaN and the other is a legitimate
+ * numeric value, the numeric value is chosen.
+ *
+ * \param x First value to compare.
+ * \param y Second value to compare.
+ *
+ * \return
+ * Returns the minimum numeric value of the arguments \p x and \p y.
+ * - If both arguments are NaN, returns NaN.
+ * - If one argument is NaN, returns the numeric argument.
+ *
+ * \note_accuracy_single
+ */
+/* MSVC with _MSC_VER >= 1800 uses the __CUDA_MATH_CRTIMP / __cdecl form without __THROW. */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fminf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl fminf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Determine the minimum numeric value of the arguments.
+ *
+ * Determines the minimum numeric value of the arguments \p x and \p y. Treats NaN 
+ * arguments as missing data. If one argument is a NaN and the other is a legitimate
+ * numeric value, the numeric value is chosen.
+ *
+ * \param x First value to compare.
+ * \param y Second value to compare.
+ *
+ * \return
+ * Returns the minimum numeric value of the arguments \p x and \p y.
+ * - If both arguments are NaN, returns NaN.
+ * - If one argument is NaN, returns the numeric argument.
+ *
+ * \note_accuracy_double
+ */
+/* MSVC with _MSC_VER >= 1800 uses the __CUDA_MATH_CRTIMP / __cdecl form without __THROW. */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 fmin(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl fmin(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ *
+ * \param a First value to compare.
+ * \param b Second value to compare.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    max(const int a, const int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ *
+ * \param a First value to compare.
+ * \param b Second value to compare.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           umax(const unsigned int a, const unsigned int b);
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ *
+ * \param a First value to compare.
+ * \param b Second value to compare.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llmax(const long long int a, const long long int b);
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ *
+ * \param a First value to compare.
+ * \param b Second value to compare.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int ullmax(const unsigned long long int a, const unsigned long long int b);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Determine the maximum numeric value of the arguments.
+ *
+ * Determines the maximum numeric value of the arguments \p x and \p y. Treats NaN 
+ * arguments as missing data. If one argument is a NaN and the other is a legitimate
+ * numeric value, the numeric value is chosen.
+ *
+ * \param x First value to compare.
+ * \param y Second value to compare.
+ *
+ * \return
+ * Returns the maximum numeric value of the arguments \p x and \p y.
+ * - If both arguments are NaN, returns NaN.
+ * - If one argument is NaN, returns the numeric argument.
+ *
+ * \note_accuracy_single
+ */
+/* MSVC with _MSC_VER >= 1800 uses the __CUDA_MATH_CRTIMP / __cdecl form without __THROW. */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fmaxf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl fmaxf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Determine the maximum numeric value of the arguments.
+ *
+ * Determines the maximum numeric value of the arguments \p x and \p y. Treats NaN 
+ * arguments as missing data. If one argument is a NaN and the other is a legitimate
+ * numeric value, the numeric value is chosen.
+ *
+ * \param x First value to compare.
+ * \param y Second value to compare.
+ *
+ * \return
+ * Returns the maximum numeric value of the arguments \p x and \p y.
+ * - If both arguments are NaN, returns NaN.
+ * - If one argument is NaN, returns the numeric argument.
+ *
+ * \note_accuracy_double
+ */
+/* The parameters are named x and y so the prototype matches the \p x / \p y
+ * references above and the sibling fmin / fmaxf prototypes.
+ * MSVC with _MSC_VER >= 1800 uses the __CUDA_MATH_CRTIMP / __cdecl form without __THROW. */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 fmax(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl fmax(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the sine of the input argument.
+ *
+ * Calculate the sine of the input argument \p x (measured in radians).
+ *
+ * \param x Angle, measured in radians.
+ *
+ * \return 
+ * - sin(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - sin(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl sin(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the cosine of the input argument.
+ *
+ * Calculate the cosine of the input argument \p x (measured in radians).
+ *
+ * \param x Angle, measured in radians.
+ *
+ * \return 
+ * - cos(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1.
+ * - cos(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl cos(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the sine and cosine of the first input argument.
+ *
+ * Calculate the sine and cosine of the first input argument \p x (measured
+ * in radians). The sine is written to the location pointed to by the second
+ * argument, \p sptr, and the cosine to the location pointed to by the third
+ * argument, \p cptr.
+ *
+ * \return 
+ * - none
+ *
+ * \see ::sin() and ::cos().
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   sincos(double x, double *sptr, double *cptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the sine and cosine of the first input argument.
+ *
+ * Calculate the sine and cosine of the first input argument \p x (measured
+ * in radians). The sine is written to the location pointed to by the second
+ * argument, \p sptr, and the cosine to the location pointed to by the third
+ * argument, \p cptr.
+ *
+ * \return 
+ * - none
+ *
+ * \see ::sinf() and ::cosf().
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   sincosf(float x, float *sptr, float *cptr) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the tangent of the input argument.
+ *
+ * Calculate the tangent of the input argument \p x (measured in radians).
+ *
+ * \return 
+ * - tan(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - tan(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl tan(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the square root of the input argument.
+ *
+ * Calculate the nonnegative square root of \p x, 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return 
+ * Returns 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - sqrt(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - sqrt(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - sqrt(\p x) returns NaN if \p x is less than 0.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl sqrt(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the reciprocal of the square root of the input argument.
+ *
+ * Calculate the reciprocal of the nonnegative square root of \p x, 
+ * \latexonly $1/\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mn>1</m:mn>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo>/</m:mo>
+ *   </m:mrow>
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return 
+ * Returns 
+ * \latexonly $1/\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mn>1</m:mn>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo>/</m:mo>
+ *   </m:mrow>
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - rsqrt(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ * - rsqrt(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - rsqrt(\p x) returns NaN if \p x is less than 0.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 rsqrt(double x);
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the reciprocal of the square root of the input argument.
+ *
+ * Calculate the reciprocal of the nonnegative square root of \p x, 
+ * \latexonly $1/\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mn>1</m:mn>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo>/</m:mo>
+ *   </m:mrow>
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return 
+ * Returns 
+ * \latexonly $1/\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mn>1</m:mn>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo>/</m:mo>
+ *   </m:mrow>
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - rsqrtf(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ * - rsqrtf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - rsqrtf(\p x) returns NaN if \p x is less than 0.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  rsqrtf(float x);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 2 logarithm of the input argument.
+ *
+ * Calculate the base 2 logarithm of the input argument \p x.
+ *
+ * \return 
+ * - log2(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - log2(1) returns +0.
+ * - log2(\p x) returns NaN for \p x < 0.
+ * - log2(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 log2(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl log2(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 2 exponential of the input argument.
+ * 
+ * Calculate
+ * \latexonly $2^x$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly,
+ * the base 2 exponential of the input argument \p x.
+ *
+ * \return
+ * - exp2(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1.
+ * - exp2(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns +0.
+ * - exp2(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 exp2(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl exp2(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 2 exponential of the input argument.
+ *
+ * Calculate
+ * \latexonly $2^x$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly,
+ * the base 2 exponential of the input argument \p x.
+ *
+ * \return
+ * - exp2f(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1.
+ * - exp2f(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns +0.
+ * - exp2f(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  exp2f(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl exp2f(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 10 exponential of the input argument.
+ *
+ * Calculate
+ * \latexonly $10^x$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mn>10</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly,
+ * the base 10 exponential of the input argument \p x.
+ *
+ * \return
+ * - exp10(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1.
+ * - exp10(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns +0.
+ * - exp10(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ */         
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 exp10(double x) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 10 exponential of the input argument.
+ *
+ * Calculate
+ * \latexonly $10^x$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mn>10</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly,
+ * the base 10 exponential of the input argument \p x.
+ *
+ * \return
+ * - exp10f(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1.
+ * - exp10f(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns +0.
+ * - exp10f(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  exp10f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 
+ * \latexonly $e$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>e</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  exponential of the input argument, minus 1.
+ *
+ * Calculate
+ * \latexonly $e^x$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>e</m:mi>
+ *     <m:mi>x</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * -1, the base
+ * \latexonly $e$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>e</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  exponential of the input argument \p x, minus 1.
+ *
+ * \return
+ * - expm1(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - expm1(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns -1.
+ * - expm1(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 expm1(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl expm1(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 
+ * \latexonly $e$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>e</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  exponential of the input argument, minus 1.
+ *
+ * Calculate
+ * \latexonly $e^x$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>e</m:mi>
+ *     <m:mi>x</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * -1, the base
+ * \latexonly $e$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>e</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  exponential of the input argument \p x, minus 1.
+ *
+ * \return
+ * - expm1f(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - expm1f(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns -1.
+ * - expm1f(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  expm1f(float x) __THROW;        
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl expm1f(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 2 logarithm of the input argument.
+ *
+ * Calculate the base 2 logarithm of the input argument \p x.
+ *
+ * \return
+ * - log2f(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - log2f(1) returns +0.
+ * - log2f(\p x) returns NaN for \p x < 0.
+ * - log2f(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  log2f(float x) __THROW;         
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl log2f(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 10 logarithm of the input argument.
+ *
+ * Calculate the base 10 logarithm of the input argument \p x.
+ *
+ * \return
+ * - log10(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - log10(1) returns +0.
+ * - log10(\p x) returns NaN for \p x < 0.
+ * - log10(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl log10(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 
+ * \latexonly $e$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>e</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  logarithm of the input argument.
+ *
+ * Calculate the base 
+ * \latexonly $e$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>e</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  logarithm of the input argument \p x.
+ *
+ * \return
+ * - log(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - log(1) returns +0.
+ * - log(\p x) returns NaN for \p x < 0.
+ * - log(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl log(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of 
+ * \latexonly $\log_{e}(1+x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mn>log</m:mn>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mi>e</m:mi>
+ *     </m:mrow>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mn>1</m:mn>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * Calculate the value of 
+ * \latexonly $\log_{e}(1+x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mn>log</m:mn>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mi>e</m:mi>
+ *     </m:mrow>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mn>1</m:mn>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * for the input argument \p x.
+ *
+ * \return
+ * - log1p(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - log1p(-1) returns
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - log1p(\p x) returns NaN for \p x < -1.
+ * - log1p(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 log1p(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl log1p(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of 
+ * \latexonly $\log_{e}(1+x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mn>log</m:mn>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mi>e</m:mi>
+ *     </m:mrow>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mn>1</m:mn>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * Calculate the value of 
+ * \latexonly $\log_{e}(1+x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mn>log</m:mn>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mi>e</m:mi>
+ *     </m:mrow>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mn>1</m:mn>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * of the input argument \p x.
+ *
+ * \return
+ * - log1pf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - log1pf(-1) returns
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - log1pf(\p x) returns NaN for \p x < -1.
+ * - log1pf(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  log1pf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl log1pf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the largest integer less than or equal to \p x.
+ * 
+ * Calculates the largest integer value which is less than or equal to \p x.
+ * 
+ * \return
+ * Returns 
+ * \latexonly $\lfloor x \rfloor$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo fence="false" stretchy="false">&#x230A;<!-- &Lfloor --></m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo fence="false" stretchy="false">&#x230B;<!-- &Rfloor --></m:mo>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  expressed as a floating-point number.
+ * - floor(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - floor(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl floor(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 
+ * \latexonly $e$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>e</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  exponential of the input argument.
+ *
+ * Calculate
+ * \latexonly $e^x$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>e</m:mi>
+ *     <m:mi>x</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly,
+ * the base
+ * \latexonly $e$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>e</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  exponential of the input argument \p x.
+ *
+ * \return
+ * - exp(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1.
+ * - exp(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns +0.
+ * - exp(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl exp(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the hyperbolic cosine of the input argument.
+ *
+ * Calculate the hyperbolic cosine of the input argument \p x.
+ *
+ * \return
+ * - cosh(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1.
+ * - cosh(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl cosh(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the hyperbolic sine of the input argument.
+ *
+ * Calculate the hyperbolic sine of the input argument \p x.
+ *
+ * \return
+ * - sinh(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - sinh(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl sinh(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the hyperbolic tangent of the input argument.
+ *
+ * Calculate the hyperbolic tangent of the input argument \p x.
+ *
+ * \return
+ * - tanh(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - tanh( 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pm 1$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>1</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl tanh(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the nonnegative inverse hyperbolic cosine of the input argument.
+ *
+ * Calculate the nonnegative inverse hyperbolic cosine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in the interval [0, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ].
+ * - acosh(1) returns 0.
+ * - acosh(\p x) returns NaN for \p x in the interval [
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 1).
+ * - acosh( 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 acosh(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl acosh(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the nonnegative inverse hyperbolic cosine of the input argument.
+ *
+ * Calculate the nonnegative inverse hyperbolic cosine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in the interval [0, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ].
+ * - acoshf(1) returns 0.
+ * - acoshf(\p x) returns NaN for \p x in the interval [
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 1).
+ * - acoshf( 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  acoshf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl acoshf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the inverse hyperbolic sine of the input argument.
+ *
+ * Calculate the inverse hyperbolic sine of the input argument \p x.
+ *
+ * \return
+ * - asinh(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - asinh(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 asinh(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl asinh(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the inverse hyperbolic sine of the input argument.
+ *
+ * Calculate the inverse hyperbolic sine of the input argument \p x.
+ *
+ * \return 
+ * - asinhf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ * - asinhf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * 
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  asinhf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl asinhf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the inverse hyperbolic tangent of the input argument.
+ *
+ * Calculate the inverse hyperbolic tangent of the input argument \p x.
+ *
+ * \return 
+ * - atanh(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - atanh(
+ * \latexonly $\pm 1$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>1</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - atanh(\p x) returns NaN for \p x outside the interval [-1, 1].
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 atanh(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl atanh(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the inverse hyperbolic tangent of the input argument.
+ *
+ * Calculate the inverse hyperbolic tangent of the input argument \p x.
+ *
+ * \return 
+ * - atanhf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - atanhf(
+ * \latexonly $\pm 1$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>1</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - atanhf(\p x) returns NaN for \p x outside the interval [-1, 1].
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  atanhf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl atanhf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of 
+ * \latexonly $x\cdot 2^{exp}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x22C5;<!-- &Sdot --></m:mo>
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mi>e</m:mi>
+ *       <m:mi>x</m:mi>
+ *       <m:mi>p</m:mi>
+ *     </m:mrow>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * Calculate the value of 
+ * \latexonly $x\cdot 2^{exp}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x22C5;<!-- &Sdot --></m:mo>
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mi>e</m:mi>
+ *       <m:mi>x</m:mi>
+ *       <m:mi>p</m:mi>
+ *     </m:mrow>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  of the input arguments \p x and \p exp.
+ *
+ * \return 
+ * - ldexp(\p x, \p exp) is equivalent to scalbn(\p x, \p exp).
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl ldexp(double x, int exp) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of 
+ * \latexonly $x\cdot 2^{exp}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x22C5;<!-- &Sdot --></m:mo>
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mi>e</m:mi>
+ *       <m:mi>x</m:mi>
+ *       <m:mi>p</m:mi>
+ *     </m:mrow>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * Calculate the value of 
+ * \latexonly $x\cdot 2^{exp}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x22C5;<!-- &Sdot --></m:mo>
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mi>e</m:mi>
+ *       <m:mi>x</m:mi>
+ *       <m:mi>p</m:mi>
+ *     </m:mrow>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  of the input arguments \p x and \p exp.
+ *
+ * \return 
+ * - ldexpf(\p x, \p exp) is equivalent to scalbnf(\p x, \p exp).
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  ldexpf(float x, int exp) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the floating-point representation of the exponent of the input argument.
+ *
+ * Calculate the floating-point representation of the exponent of the input argument \p x.
+ *
+ * \return 
+ * - logb(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - logb(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 logb(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl logb(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the floating-point representation of the exponent of the input argument.
+ *
+ * Calculate the floating-point representation of the exponent of the input argument \p x.
+ *
+ * \return 
+ * - logbf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - logbf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  logbf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl logbf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Compute the unbiased integer exponent of the argument.
+ *
+ * Calculates the unbiased integer exponent of the input argument \p x.
+ *
+ * \return
+ * - If successful, returns the unbiased exponent of the argument.
+ * - ilogb(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns <tt>INT_MIN</tt>.
+ * - ilogb(NaN) returns <tt>INT_MIN</tt>.
+ * - ilogb(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns <tt>INT_MAX</tt>.
+ * - Note: the above behavior does not take into account <tt>FP_ILOGB0</tt> nor <tt>FP_ILOGBNAN</tt>.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    ilogb(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP int    __cdecl ilogb(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Compute the unbiased integer exponent of the argument.
+ *
+ * Calculates the unbiased integer exponent of the input argument \p x.
+ *
+ * \return
+ * - If successful, returns the unbiased exponent of the argument.
+ * - ilogbf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns <tt>INT_MIN</tt>.
+ * - ilogbf(NaN) returns <tt>INT_MIN</tt>.
+ * - ilogbf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns <tt>INT_MAX</tt>.
+ * - Note: above behavior does not take into account <tt>FP_ILOGB0</tt> nor <tt>FP_ILOGBNAN</tt>.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    ilogbf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP int    __cdecl ilogbf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Scale floating-point input by integer power of two.
+ *
+ * Scale \p x by 
+ * \latexonly $2^n$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mi>n</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  by efficient manipulation of the floating-point
+ * exponent.
+ *
+ * \return 
+ * Returns \p x * 
+ * \latexonly $2^n$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mi>n</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - scalbn(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p n) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - scalbn(\p x, 0) returns \p x.
+ * - scalbn(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p n) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 scalbn(double x, int n) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl scalbn(double x, int n);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Scale floating-point input by integer power of two.
+ *
+ * Scale \p x by 
+ * \latexonly $2^n$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mi>n</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  by efficient manipulation of the floating-point
+ * exponent.
+ *
+ * \return 
+ * Returns \p x * 
+ * \latexonly $2^n$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mi>n</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - scalbnf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p n) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - scalbnf(\p x, 0) returns \p x.
+ * - scalbnf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p n) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  scalbnf(float x, int n) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl scalbnf(float x, int n);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Scale floating-point input by integer power of two.
+ *
+ * Scale \p x by 
+ * \latexonly $2^n$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mi>n</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  by efficient manipulation of the floating-point
+ * exponent.
+ *
+ * \return 
+ * Returns \p x * 
+ * \latexonly $2^n$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mi>n</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - scalbln(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p n) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - scalbln(\p x, 0) returns \p x.
+ * - scalbln(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p n) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 scalbln(double x, long int n) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl scalbln(double x, long int n);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Scale floating-point input by integer power of two.
+ *
+ * Scale \p x by 
+ * \latexonly $2^n$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mi>n</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  by efficient manipulation of the floating-point
+ * exponent.
+ *
+ * \return 
+ * Returns \p x * 
+ * \latexonly $2^n$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mi>n</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - scalblnf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p n) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - scalblnf(\p x, 0) returns \p x.
+ * - scalblnf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p n) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  scalblnf(float x, long int n) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl scalblnf(float x, long int n);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Extract mantissa and exponent of a floating-point value.
+ * 
+ * Decompose the floating-point value \p x into a component \p m for the 
+ * normalized fraction element and another term \p n for the exponent.
+ * The absolute value of \p m will be greater than or equal to  0.5 and 
+ * less than 1.0 or it will be equal to 0; 
+ * \latexonly $x = m\cdot 2^n$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>=</m:mo>
+ *   <m:mi>m</m:mi>
+ *   <m:mo>&#x22C5;<!-- &Sdot --></m:mo>
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mi>n</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * The integer exponent \p n will be stored in the location to which \p nptr points.
+ *
+ * \return
+ * Returns the fractional component \p m.
+ * - frexp(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p nptr) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  and stores zero in the location pointed to by \p nptr.
+ * - frexp(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p nptr) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  and stores an unspecified value in the 
+ * location to which \p nptr points.
+ * - frexp(NaN, \p nptr) returns a NaN and stores an unspecified value in the location to which \p nptr points.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl frexp(double x, int *nptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Extract mantissa and exponent of a floating-point value.
+ * 
+ * Decomposes the floating-point value \p x into a component \p m for the 
+ * normalized fraction element and another term \p n for the exponent.
+ * The absolute value of \p m will be greater than or equal to  0.5 and 
+ * less than 1.0 or it will be equal to 0; 
+ * \latexonly $x = m\cdot 2^n$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>=</m:mo>
+ *   <m:mi>m</m:mi>
+ *   <m:mo>&#x22C5;<!-- &Sdot --></m:mo>
+ *   <m:msup>
+ *     <m:mn>2</m:mn>
+ *     <m:mi>n</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * The integer exponent \p n will be stored in the location to which \p nptr points.
+ *
+ * \return
+ * Returns the fractional component \p m.
+ * - frexpf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p nptr) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  and stores zero in the location pointed to by \p nptr.
+ * - frexpf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p nptr) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  and stores an unspecified value in the 
+ * location to which \p nptr points.
+ * - frexpf(NaN, \p nptr) returns a NaN and stores an unspecified value in the location to which \p nptr points.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  frexpf(float x, int *nptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round to nearest integer value in floating-point.
+ *
+ * Round \p x to the nearest integer value in floating-point format,
+ * with halfway cases rounded away from zero.
+ *
+ * \return 
+ * Returns rounded integer value.
+ * - round(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - round(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_slow_round See ::rint().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 round(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl round(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round to nearest integer value in floating-point.
+ *
+ * Round \p x to the nearest integer value in floating-point format,
+ * with halfway cases rounded away from zero.
+ *
+ * \return
+ * Returns rounded integer value.
+ * - roundf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - roundf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_slow_round See ::rintf().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  roundf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl roundf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, with halfway cases rounded 
+ * away from zero.  If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ *
+ * \note_slow_round See ::lrint().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long int               lround(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long int __cdecl lround(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, with halfway cases rounded 
+ * away from zero.  If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ *
+ * \note_slow_round See ::lrintf().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long int               lroundf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long int __cdecl lroundf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, with halfway cases rounded 
+ * away from zero.  If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ *
+ * \note_slow_round See ::llrint().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llround(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long long int __cdecl llround(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, with halfway cases rounded 
+ * away from zero.  If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ *
+ * \note_slow_round See ::llrintf().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llroundf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long long int __cdecl llroundf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round to nearest integer value in floating-point.
+ *
+ * Round \p x to the nearest integer value in floating-point format,
+ * with halfway cases rounded to the nearest even integer value.
+ *
+ * \return 
+ * Returns rounded integer value.
+ * - rint(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - rint(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ */
+#if defined(__CUDA_ARCH__) || defined(__DOXYGEN_ONLY__)
+/*
+ * We don't generate the declaration of rint for host compilation.
+ * This is actually a workaround to compile the boost header file when
+ * Clang 3.8 is used as the host compiler. The boost header file has
+ * the following example code:
+ *   namespace NS { extern "C" { double rint(double); }
+ *   }
+ *
+ * After preprocessing, we get something like below:
+ *
+ * extern "C" { double rint(double x) throw(); }
+ * # 30 "/usr/include/math.h" 3
+ * extern "C" { double rint(double x) throw(); }
+ * namespace NS { extern "C" { double rint(double); } }
+ *
+ * Although GCC accepts this output, Clang 3.8 doesn't.
+ * Furthermore, we cannot change the boost header file by adding "throw()"
+ * to rint's declaration there. So, as a workaround, we just don't generate
+ * our re-declaration for the host compilation.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 rint(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl rint(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#endif /* __CUDA_ARCH__ || __DOXYGEN_ONLY__ */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round input to nearest integer value in floating-point.
+ *
+ * Round \p x to the nearest integer value in floating-point format,
+ * with halfway cases rounded to the nearest even integer value.
+ *
+ * \return 
+ * Returns rounded integer value.
+ * - rintf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - rintf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  rintf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl rintf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round input to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, 
+ * with halfway cases rounded to the nearest even integer value.
+ * If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long int               lrint(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long int __cdecl lrint(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round input to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, 
+ * with halfway cases rounded to the nearest even integer value.
+ * If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long int               lrintf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long int __cdecl lrintf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round input to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, 
+ * with halfway cases rounded to the nearest even integer value.
+ * If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llrint(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long long int __cdecl llrint(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round input to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, 
+ * with halfway cases rounded to the nearest even integer value.
+ * If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llrintf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long long int __cdecl llrintf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round the input argument to the nearest integer.
+ *
+ * Round argument \p x to an integer value in double precision floating-point format. Uses round to nearest rounding, with ties rounding to even.
+ *
+ * \return 
+ * - nearbyint(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - nearbyint(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 nearbyint(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl nearbyint(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round the input argument to the nearest integer.
+ *
+ * Round argument \p x to an integer value in single precision floating-point format. Uses round to nearest rounding, with ties rounding to even.
+ *
+ * \return 
+ * - nearbyintf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - nearbyintf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  nearbyintf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl nearbyintf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate ceiling of the input argument.
+ *
+ * Compute the smallest integer value not less than \p x.
+ *
+ * \return
+ * Returns 
+ * \latexonly $\lceil x \rceil$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo fence="false" stretchy="false">&#x2308;<!-- &Lceil --></m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo fence="false" stretchy="false">&#x2309;<!-- &Rceil --></m:mo>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * expressed as a floating-point number.
+ * - ceil(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - ceil(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl ceil(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Truncate input argument to the integral part.
+ *
+ * Round \p x to the nearest integer value that does not exceed \p x in 
+ * magnitude.
+ *
+ * \return 
+ * Returns truncated integer value.
+ * - trunc(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - trunc(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 trunc(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl trunc(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Truncate input argument to the integral part.
+ *
+ * Round \p x to the nearest integer value that does not exceed \p x in 
+ * magnitude.
+ *
+ * \return 
+ * Returns truncated integer value.
+ * - truncf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - truncf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  truncf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl truncf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Compute the positive difference between \p x and \p y.
+ *
+ * Compute the positive difference between \p x and \p y.  The positive
+ * difference is \p x - \p y when \p x > \p y and +0 otherwise.
+ *
+ * \return 
+ * Returns the positive difference between \p x and \p y.
+ * - fdim(\p x, \p y) returns \p x - \p y if \p x > \p y.
+ * - fdim(\p x, \p y) returns +0 if \p x 
+ * \latexonly $\leq$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2264;<!-- &Le --></m:mo>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly \p y.
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 fdim(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl fdim(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Compute the positive difference between \p x and \p y.
+ *
+ * Compute the positive difference between \p x and \p y.  The positive
+ * difference is \p x - \p y when \p x > \p y and +0 otherwise.
+ *
+ * \return 
+ * Returns the positive difference between \p x and \p y.
+ * - fdimf(\p x, \p y) returns \p x - \p y if \p x > \p y.
+ * - fdimf(\p x, \p y) returns +0 if \p x 
+ * \latexonly $\leq$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2264;<!-- &Le --></m:mo>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly \p y.
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fdimf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl fdimf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the arc tangent of the ratio of first and second input arguments.
+ *
+ * Calculate the principal value of the arc tangent of the ratio of first
+ * and second input arguments \p y / \p x. The quadrant of the result is
+ * determined by the signs of inputs \p y and \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \latexonly $\pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , +
+ * \latexonly $\pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ].
+ * - atan2(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , -0) returns
+ * \latexonly $\pm \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - atan2(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , +0) returns
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - atan2(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p x) returns
+ * \latexonly $\pm \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * for \p x < 0.
+ * - atan2(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p x) returns
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * for \p x > 0.
+ * - atan2(\p y,
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $-\pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * /2 for \p y < 0.
+ * - atan2(\p y,
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * /2 for \p y > 0.
+ * - atan2(
+ * \latexonly $\pm y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ,
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns
+ * \latexonly $\pm \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * for finite \p y > 0.
+ * - atan2(
+ * \latexonly $\pm y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ,
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * for finite \p y > 0.
+ * - atan2(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p x) returns
+ * \latexonly $\pm \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * /2 for finite \p x.
+ * - atan2(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ,
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns
+ * \latexonly $\pm 3\pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>3</m:mn>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * /4.
+ * - atan2(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ,
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns
+ * \latexonly $\pm \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * /4.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl atan2(double y, double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the arc tangent of the input argument.
+ *
+ * Calculate the principal value of the arc tangent of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \latexonly $\pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * /2, +
+ * \latexonly $\pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * /2].
+ * - atan(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - atan(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pm \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * /2.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl atan(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the arc cosine of the input argument.
+ *
+ * Calculate the principal value of the arc cosine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [0, 
+ * \latexonly $\pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ] for \p x inside [-1, +1].
+ * - acos(1) returns +0.
+ * - acos(\p x) returns NaN for \p x outside [-1, +1].
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl acos(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the arc sine of the input argument.
+ *
+ * Calculate the principal value of the arc sine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \latexonly $\pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * /2, +
+ * \latexonly $\pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * /2] for \p x inside [-1, +1].
+ * - asin(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - asin(\p x) returns NaN for \p x outside [-1, +1].
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl asin(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the square root of the sum of squares of two arguments.
+ *
+ * Calculate the length of the hypotenuse of a right triangle whose two sides have lengths 
+ * \p x and \p y without undue overflow or underflow.
+ *
+ * \return Returns the length of the hypotenuse 
+ * \latexonly $\sqrt{x^2+y^2}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:msup>
+ *       <m:mi>x</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *     <m:mo>+</m:mo>
+ *     <m:msup>
+ *       <m:mi>y</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ * - hypot(\p x,\p y), hypot(\p y,\p x), and hypot(\p x, \p -y) are equivalent.
+ * - hypot(\p x,
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) is equivalent to fabs(\p x).
+ * - hypot(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ,\p y) returns
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly,
+ * even if \p y is a NaN.
+ *
+ * \note_accuracy_double
+ */
+#if defined(_WIN32)
+#if defined(_MSC_VER) && _MSC_VER < 1900
+static __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double __CRTDECL hypot(double x, double y);
+#else
+extern _ACRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double __cdecl hypot(double x, double y);
+#endif
+#else /* _WIN32 */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double           hypot(double x, double y) __THROW;
+#endif /* _WIN32 */
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate one over the square root of the sum of squares of two arguments.
+ *
+ * Calculate one over the length of the hypotenuse of a right triangle whose two sides have 
+ * lengths \p x and \p y without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the hypotenuse 
+ * \latexonly $\frac{1}{\sqrt{x^2+y^2}}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mrow>
+ *       <m:mn>1</m:mn>
+ *     </m:mrow>
+ *     <m:mrow>
+ *       <m:msqrt>
+ *         <m:msup>
+ *           <m:mi>x</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *         <m:mo>+</m:mo>
+ *         <m:msup>
+ *           <m:mi>y</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *       </m:msqrt>
+ *     </m:mrow>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ * - rhypot(\p x,\p y), rhypot(\p y,\p x), and rhypot(\p x, \p -y) are equivalent.
+ * - rhypot(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ,\p y) returns +0,
+ * even if \p y is a NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                rhypot(double x, double y) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the square root of the sum of squares of two arguments.
+ *
+ * Calculates the length of the hypotenuse of a right triangle whose two sides have lengths 
+ * \p x and \p y without undue overflow or underflow.
+ *
+ * \return Returns the length of the hypotenuse 
+ * \latexonly $\sqrt{x^2+y^2}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:msup>
+ *       <m:mi>x</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *     <m:mo>+</m:mo>
+ *     <m:msup>
+ *       <m:mi>y</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ * - hypotf(\p x,\p y), hypotf(\p y,\p x), and hypotf(\p x, \p -y) are equivalent.
+ * - hypotf(\p x,
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) is equivalent to fabsf(\p x).
+ * - hypotf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ,\p y) returns
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly,
+ * even if \p y is a NaN.
+ *
+ * \note_accuracy_single
+ */
+#if defined(_WIN32)
+static __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __CRTDECL hypotf(float x, float y);
+#else /* _WIN32 */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float           hypotf(float x, float y) __THROW;
+#endif /* _WIN32 */
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate one over the square root of the sum of squares of two arguments.
+ *
+ * Calculates one over the length of the hypotenuse of a right triangle whose two sides have 
+ * lengths \p x and \p y without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the hypotenuse 
+ * \latexonly $\frac{1}{\sqrt{x^2+y^2}}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mrow>
+ *       <m:mn>1</m:mn>
+ *     </m:mrow>
+ *     <m:mrow>
+ *       <m:msqrt>
+ *         <m:msup>
+ *           <m:mi>x</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *         <m:mo>+</m:mo>
+ *         <m:msup>
+ *           <m:mi>y</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *       </m:msqrt>
+ *     </m:mrow>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ * - rhypotf(\p x,\p y), rhypotf(\p y,\p x), and rhypotf(\p x, \p -y) are equivalent.
+ * - rhypotf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ,\p y) returns +0,
+ * even if \p y is a NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float                 rhypotf(float x, float y) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the square root of the sum of squares of three coordinates of the argument.
+ *
+ * Calculate the length of a three-dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns the length of the 3D vector
+ * \latexonly $\sqrt{a^2+b^2+c^2}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:msup>
+ *       <m:mi>a</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *     <m:mo>+</m:mo>
+ *     <m:msup>
+ *       <m:mi>b</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *     <m:mo>+</m:mo>
+ *     <m:msup>
+ *       <m:mi>c</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ * - In the presence of an exactly infinite coordinate
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * is returned, even if there are NaNs.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl norm3d(double a, double b, double c) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate one over the square root of the sum of squares of three coordinates.
+ *
+ * Calculate one over the length of a three-dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the 3D vector 
+ * \latexonly $\frac{1}{\sqrt{a^2+b^2+c^2}}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mrow>
+ *       <m:mn>1</m:mn>
+ *     </m:mrow>
+ *     <m:mrow>
+ *       <m:msqrt>
+ *         <m:msup>
+ *           <m:mi>a</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *         <m:mo>+</m:mo>
+ *         <m:msup>
+ *           <m:mi>b</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *         <m:mo>+</m:mo>
+ *         <m:msup>
+ *           <m:mi>c</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *       </m:msqrt>
+ *     </m:mrow>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ * - In the presence of an exactly infinite coordinate
+ * \latexonly $+0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * is returned, even if there are NaNs.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                rnorm3d(double a, double b, double c) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the square root of the sum of squares of four coordinates of the argument.
+ *
+ * Calculate the length of four dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns the length of 4D vector
+ * \latexonly $\sqrt{a^2+b^2+c^2+d^2}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:msup>
+ *       <m:mi>a</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *     <m:mo>+</m:mo>
+ *     <m:msup>
+ *       <m:mi>b</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *     <m:mo>+</m:mo>
+ *     <m:msup>
+ *       <m:mi>c</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *     <m:mo>+</m:mo>
+ *     <m:msup>
+ *       <m:mi>d</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ * - In the presence of an exactly infinite coordinate
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * is returned, even if there are NaNs.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl norm4d(double a, double b, double c, double d) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate one over the square root of the sum of squares of four coordinates.
+ *
+ * Calculate one over the length of four dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the 4D vector 
+ * \latexonly $\frac{1}{\sqrt{a^2+b^2+c^2+d^2}}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mrow>
+ *       <m:mi>1</m:mi>
+ *     </m:mrow>
+ *     <m:mrow>
+ *       <m:msqrt>
+ *         <m:msup>
+ *           <m:mi>a</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *         <m:mo>+</m:mo>
+ *         <m:msup>
+ *           <m:mi>b</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *         <m:mo>+</m:mo>
+ *         <m:msup>
+ *           <m:mi>c</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *	       <m:mo>+</m:mo>
+ *         <m:msup>
+ *           <m:mi>d</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *       </m:msqrt>
+ *     </m:mrow>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ * - In the presence of an exactly infinite coordinate
+ * \latexonly $+0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * is returned, even if there are NaNs.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double rnorm4d(double a, double b, double c, double d) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the square root of the sum of squares of any number of coordinates.
+ *
+ * Calculate the length of a vector \p p, dimension of which is passed as an argument \p dim, without undue overflow or underflow.
+ *
+ * \return Returns the length of the dim-D vector 
+ * \latexonly $\sqrt{\sum_{i=0}^{dim-1} p_i^2}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:msup>
+ *       <m:msub>
+ *         <m:mi>p</m:mi>
+ *         <m:mn>0</m:mn>
+ *       </m:msub>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *     <m:mo>+</m:mo>
+ *     <m:msup>
+ *       <m:msub>
+ *         <m:mi>p</m:mi>
+ *         <m:mn>1</m:mn>
+ *       </m:msub>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *     <m:mo>+ ... +</m:mo>
+ *     <m:msup>
+ *       <m:msub>
+ *         <m:mi>p</m:mi>
+ *         <m:mrow>
+ *           <m:mi>dim</m:mi>
+ *           <m:mo>-</m:mo>
+ *           <m:mn>1</m:mn>
+ *         </m:mrow>
+ *       </m:msub>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - In the presence of an exactly infinite coordinate
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * is returned, even if there are NaNs.
+ *
+ * \note_accuracy_double
+ */
+ __device__ __device_builtin__  double norm(int dim, double const * p) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the reciprocal of square root of the sum of squares of any number of coordinates.
+ *
+ * Calculates one over the length of vector \p p, dimension of which is passed as an argument, in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the vector
+ * \latexonly $\frac{1}{\sqrt{\sum_{i=0}^{dim-1} p_i^2}}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mrow>
+ *       <m:mi>1</m:mi>
+ *     </m:mrow>
+ *     <m:mrow>
+ *       <m:msqrt>
+ *         <m:msup>
+ *           <m:msub>
+ *             <m:mi>p</m:mi>
+ *             <m:mn>0</m:mn>
+ *           </m:msub>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *         <m:mo>+</m:mo>
+ *         <m:msup>
+ *           <m:msub>
+ *             <m:mi>p</m:mi>
+ *             <m:mn>1</m:mn>
+ *           </m:msub>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *         <m:mo>+ ... +</m:mo>
+ *         <m:msup>
+ *           <m:msub>
+ *             <m:mi>p</m:mi>
+ *             <m:mrow>
+ *               <m:mi>dim</m:mi>
+ *               <m:mo>-</m:mo>
+ *               <m:mn>1</m:mn>
+ *             </m:mrow>
+ *           </m:msub>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *       </m:msqrt>
+ *     </m:mrow>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ * - In the presence of an exactly infinite coordinate
+ * \latexonly $+0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * is returned, even if there are NaNs.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double rnorm(int dim, double const * p) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the reciprocal of square root of the sum of squares of any number of coordinates.
+ *
+ * Calculates one over the length of vector \p p, dimension of which is passed as an argument, in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the vector
+ * \latexonly $\frac{1}{\sqrt{\sum_{i=0}^{dim-1} p_i^2}}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mrow>
+ *       <m:mi>1</m:mi>
+ *     </m:mrow>
+ *     <m:mrow>
+ *       <m:msqrt>
+ *         <m:msup>
+ *           <m:msub>
+ *             <m:mi>p</m:mi>
+ *             <m:mn>0</m:mn>
+ *           </m:msub>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *         <m:mo>+</m:mo>
+ *         <m:msup>
+ *           <m:msub>
+ *             <m:mi>p</m:mi>
+ *             <m:mn>1</m:mn>
+ *           </m:msub>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *         <m:mo>+ ... +</m:mo>
+ *         <m:msup>
+ *           <m:msub>
+ *             <m:mi>p</m:mi>
+ *             <m:mrow>
+ *               <m:mi>dim</m:mi>
+ *               <m:mo>-</m:mo>
+ *               <m:mn>1</m:mn>
+ *             </m:mrow>
+ *           </m:msub>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *       </m:msqrt>
+ *     </m:mrow>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ * - In the presence of an exactly infinite coordinate
+ * \latexonly $+0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * is returned, even if there are NaNs.
+ *
+ * \note_accuracy_single
+ */
+
+extern __device__ __device_builtin__ float rnormf(int dim, float const * p) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the square root of the sum of squares of any number of coordinates.
+ *
+ * Calculates the length of a vector \p p, dimension of which is passed as an argument without undue overflow or underflow.
+ *
+ * \return Returns the length of the dim-D vector 
+ * \latexonly $\sqrt{\sum_{i=0}^{dim-1} p_i^2}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:msup>
+ *       <m:msub>
+ *         <m:mi>p</m:mi>
+ *         <m:mn>0</m:mn>
+ *       </m:msub>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *     <m:mo>+</m:mo>
+ *     <m:msup>
+ *       <m:msub>
+ *         <m:mi>p</m:mi>
+ *         <m:mn>1</m:mn>
+ *       </m:msub>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *     <m:mo>+ ... +</m:mo>
+ *     <m:msup>
+ *       <m:msub>
+ *         <m:mi>p</m:mi>
+ *         <m:mrow>
+ *           <m:mi>dim</m:mi>
+ *           <m:mo>-</m:mo>
+ *           <m:mn>1</m:mn>
+ *         </m:mrow>
+ *       </m:msub>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - In the presence of an exactly infinite coordinate
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * is returned, even if there are NaNs.
+ *
+ * \note_accuracy_single
+ */
+ __device__ __device_builtin__  float normf(int dim, float const * p) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the square root of the sum of squares of three coordinates of the argument.
+ *
+ * Calculates the length of three dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns the length of the 3D vector 
+ * \latexonly $\sqrt{a^2+b^2+c^2}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:msup>
+ *       <m:mi>a</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *     <m:mo>+</m:mo>
+ *     <m:msup>
+ *       <m:mi>b</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *     <m:mo>+</m:mo>
+ *     <m:msup>
+ *       <m:mi>c</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ * - In the presence of an exactly infinite coordinate
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * is returned, even if there are NaNs.
+ *
+ * \note_accuracy_single
+ */
+
+extern __device__ __device_builtin__ float norm3df(float a, float b, float c) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate one over the square root of the sum of squares of three coordinates.
+ *
+ * Calculates one over the length of three dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the 3D vector
+ * \latexonly $\frac{1}{\sqrt{a^2+b^2+c^2}}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mrow>
+ *       <m:mi>1</m:mi>
+ *     </m:mrow>
+ *     <m:mrow>
+ *       <m:msqrt>
+ *         <m:msup>
+ *           <m:mi>a</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *         <m:mo>+</m:mo>
+ *         <m:msup>
+ *           <m:mi>b</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *         <m:mo>+</m:mo>
+ *         <m:msup>
+ *           <m:mi>c</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *       </m:msqrt>
+ *     </m:mrow>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ * - In the presence of an exactly infinite coordinate
+ * \latexonly $+0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * is returned, even if there are NaNs.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float rnorm3df(float a, float b, float c) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the square root of the sum of squares of four coordinates of the argument.
+ *
+ * Calculates the length of four dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns the length of the 4D vector
+ * \latexonly $\sqrt{a^2+b^2+c^2+d^2}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:msup>
+ *       <m:mi>a</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *     <m:mo>+</m:mo>
+ *     <m:msup>
+ *       <m:mi>b</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *     <m:mo>+</m:mo>
+ *     <m:msup>
+ *       <m:mi>c</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *     <m:mo>+</m:mo>
+ *     <m:msup>
+ *       <m:mi>d</m:mi>
+ *       <m:mn>2</m:mn>
+ *     </m:msup>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ * - In the presence of an exactly infinite coordinate
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * is returned, even if there are NaNs.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float norm4df(float a, float b, float c, float d) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate one over the square root of the sum of squares of four coordinates.
+ *
+ * Calculates one over the length of four dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the 4D vector
+ * \latexonly $\frac{1}{\sqrt{a^2+b^2+c^2+d^2}}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mrow>
+ *       <m:mi>1</m:mi>
+ *     </m:mrow>
+ *     <m:mrow>
+ *       <m:msqrt>
+ *         <m:msup>
+ *           <m:mi>a</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *         <m:mo>+</m:mo>
+ *         <m:msup>
+ *           <m:mi>b</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *         <m:mo>+</m:mo>
+ *         <m:msup>
+ *           <m:mi>c</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *         <m:mo>+</m:mo>
+ *         <m:msup>
+ *           <m:mi>d</m:mi>
+ *           <m:mn>2</m:mn>
+ *         </m:msup>
+ *       </m:msqrt>
+ *     </m:mrow>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ * - In the presence of an exactly infinite coordinate
+ * \latexonly $+0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * is returned, even if there are NaNs.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float rnorm4df(float a, float b, float c, float d) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the cube root of the input argument.
+ *
+ * Calculate the cube root of \p x, 
+ * \latexonly $x^{1/3}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>x</m:mi>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mn>1</m:mn>
+ *       <m:mrow class="MJX-TeXAtom-ORD">
+ *         <m:mo>/</m:mo>
+ *       </m:mrow>
+ *       <m:mn>3</m:mn>
+ *     </m:mrow>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return 
+ * Returns 
+ * \latexonly $x^{1/3}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>x</m:mi>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mn>1</m:mn>
+ *       <m:mrow class="MJX-TeXAtom-ORD">
+ *         <m:mo>/</m:mo>
+ *       </m:mrow>
+ *       <m:mn>3</m:mn>
+ *     </m:mrow>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - cbrt(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - cbrt(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 cbrt(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl cbrt(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the cube root of the input argument.
+ *
+ * Calculate the cube root of \p x, 
+ * \latexonly $x^{1/3}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>x</m:mi>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mn>1</m:mn>
+ *       <m:mrow class="MJX-TeXAtom-ORD">
+ *         <m:mo>/</m:mo>
+ *       </m:mrow>
+ *       <m:mn>3</m:mn>
+ *     </m:mrow>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return 
+ * Returns 
+ * \latexonly $x^{1/3}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>x</m:mi>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mn>1</m:mn>
+ *       <m:mrow class="MJX-TeXAtom-ORD">
+ *         <m:mo>/</m:mo>
+ *       </m:mrow>
+ *       <m:mn>3</m:mn>
+ *     </m:mrow>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - cbrtf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - cbrtf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  cbrtf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl cbrtf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate reciprocal cube root function.
+ *
+ * Calculate reciprocal cube root function of \p x.
+ *
+ * \return 
+ * - rcbrt(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - rcbrt(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 rcbrt(double x);
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate reciprocal cube root function.
+ *
+ * Calculate reciprocal cube root function of \p x.
+ *
+ * \return 
+ * - rcbrtf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - rcbrtf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  rcbrtf(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the sine of the input argument 
+ * \latexonly $\times \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * Calculate the sine of \p x
+ * \latexonly $\times \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  (measured in radians), 
+ * where \p x is the input argument.
+ *
+ * \return 
+ * - sinpi(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - sinpi(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 sinpi(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the sine of the input argument 
+ * \latexonly $\times \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * Calculate the sine of \p x
+ * \latexonly $\times \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  (measured in radians), 
+ * where \p x is the input argument.
+ *
+ * \return 
+ * - sinpif(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - sinpif(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  sinpif(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the cosine of the input argument 
+ * \latexonly $\times \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * Calculate the cosine of \p x
+ * \latexonly $\times \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  (measured in radians), 
+ * where \p x is the input argument.
+ *
+ * \return 
+ * - cospi(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1.
+ * - cospi(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 cospi(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the cosine of the input argument 
+ * \latexonly $\times \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * Calculate the cosine of \p x
+ * \latexonly $\times \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  (measured in radians),
+ * where \p x is the input argument.
+ *
+ * \return 
+ * - cospif(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1.
+ * - cospif(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  cospif(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief  Calculate the sine and cosine of the first input argument 
+ * \latexonly $\times \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * Calculate the sine and cosine of the first input argument \p x
+ * \latexonly $\times \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly (measured in radians). The results for sine and cosine are written into the
+ * second argument, \p sptr, and, respectively, third argument, \p cptr.
+ *
+ * \return 
+ * - none
+ *
+ * \see ::sinpi() and ::cospi().
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   sincospi(double x, double *sptr, double *cptr);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief  Calculate the sine and cosine of the first input argument 
+ * \latexonly $\times \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * Calculate the sine and cosine of the first input argument \p x
+ * \latexonly $\times \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly (measured in radians). The results for sine and cosine are written into the
+ * second argument, \p sptr, and, respectively, third argument, \p cptr.
+ *
+ * \return 
+ * - none
+ *
+ * \see ::sinpif() and ::cospif().
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   sincospif(float x, float *sptr, float *cptr);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of first argument to the power of second argument.
+ *
+ * Calculate the value of \p x to the power of \p y.
+ *
+ * \return 
+ * - pow(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  for \p y an odd integer less than 0.
+ * - pow(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  for \p y less than 0 and not an odd integer.
+ * - pow(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  for \p y an odd integer greater than 0.
+ * - pow(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns +0 for \p y > 0 and not an odd integer.
+ * - pow(-1, 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1.
+ * - pow(+1, \p y) returns 1 for any \p y, even a NaN.
+ * - pow(\p x, 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1 for any \p x, even a NaN.
+ * - pow(\p x, \p y) returns a NaN for finite \p x < 0 and finite non-integer \p y.
+ * - pow(\p x, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  for 
+ * \latexonly $| x | < 1$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mi>x</m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mo>&lt;</m:mo>
+ *   <m:mn>1</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - pow(\p x, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0 for 
+ * \latexonly $| x | > 1$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mi>x</m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mo>&gt;</m:mo>
+ *   <m:mn>1</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - pow(\p x, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0 for 
+ * \latexonly $| x | < 1$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mi>x</m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mo>&lt;</m:mo>
+ *   <m:mn>1</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - pow(\p x, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  for 
+ * \latexonly $| x | > 1$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mi>x</m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mo>&gt;</m:mo>
+ *   <m:mn>1</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - pow(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns -0 for \p y an odd integer less than 0.
+ * - pow(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns +0 for \p y < 0 and not an odd integer.
+ * - pow(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  for \p y an odd integer greater than 0.
+ * - pow(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  for \p y > 0 and not an odd integer.
+ * - pow(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns +0 for \p y < 0.
+ * - pow(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  for \p y > 0.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl pow(double x, double y) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Break down the input argument into fractional and integral parts.
+ *
+ * Break down the argument \p x into fractional and integral parts. The 
+ * integral part is stored in the argument \p iptr.
+ * Fractional and integral parts are given the same sign as the argument \p x.
+ *
+ * \return 
+ * - modf(
+ * \latexonly $\pm x$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>x</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p iptr) returns a result with the same sign as \p x.
+ * - modf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p iptr) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  and stores 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *   in the object pointed to by \p iptr.
+ * - modf(NaN, \p iptr) stores a NaN in the object pointed to by \p iptr and returns a NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl modf(double x, double *iptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the double-precision floating-point remainder of \p x / \p y.
+ *
+ * Calculate the double-precision floating-point remainder of \p x / \p y.
+ * The floating-point remainder of the division operation \p x / \p y calculated
+ * by this function is exactly the value <tt>x - n*y</tt>, where \p n is \p x / \p y with its fractional part truncated.
+ * The computed value will have the same sign as \p x, and its magnitude will be less than the magnitude of \p y.
+ *
+ * \return
+ * - Returns the floating-point remainder of \p x / \p y.
+ * - fmod(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  if \p y is not zero.
+ * - fmod(\p x, 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns \p x if \p x is finite.
+ * - fmod(\p x, \p y) returns NaN if \p x is 
+ * \latexonly $\pm\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  or \p y is zero.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl fmod(double x, double y) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Compute double-precision floating-point remainder.
+ *
+ * Compute double-precision floating-point remainder \p r of dividing 
+ * \p x by \p y for nonzero \p y. Thus 
+ * \latexonly $ r = x - n y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>r</m:mi>
+ *   <m:mo>=</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi>n</m:mi>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * The value \p n is the integer value nearest 
+ * \latexonly $ \frac{x}{y} $ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mi>x</m:mi>
+ *     <m:mi>y</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ * In the case when 
+ * \latexonly $ | n -\frac{x}{y} | = \frac{1}{2} $ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mi>n</m:mi>
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mfrac>
+ *     <m:mi>x</m:mi>
+ *     <m:mi>y</m:mi>
+ *   </m:mfrac>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mo>=</m:mo>
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mn>2</m:mn>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , the
+ * even \p n value is chosen.
+ *
+ * \return 
+ * - remainder(\p x,
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN.
+ * - remainder(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns NaN.
+ * - remainder(\p x, 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns \p x for finite \p x.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 remainder(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl remainder(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Compute single-precision floating-point remainder.
+ *
+ * Compute single-precision floating-point remainder \p r of dividing 
+ * \p x by \p y for nonzero \p y. Thus 
+ * \latexonly $ r = x - n y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>r</m:mi>
+ *   <m:mo>=</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi>n</m:mi>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * The value \p n is the integer value nearest 
+ * \latexonly $ \frac{x}{y} $ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mi>x</m:mi>
+ *     <m:mi>y</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. 
+ * In the case when 
+ * \latexonly $ | n -\frac{x}{y} | = \frac{1}{2} $ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mi>n</m:mi>
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mfrac>
+ *     <m:mi>x</m:mi>
+ *     <m:mi>y</m:mi>
+ *   </m:mfrac>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mo>=</m:mo>
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mn>2</m:mn>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , the
+ * even \p n value is chosen.
+ *
+ * \return 
+ * - remainderf(\p x,
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN.
+ * - remainderf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns NaN.
+ * - remainderf(\p x, 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns \p x for finite \p x.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  remainderf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl remainderf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Compute double-precision floating-point remainder and part of quotient.
+ *
+ * Compute a double-precision floating-point remainder in the same way as the
+ * ::remainder() function. Argument \p quo returns part of quotient upon 
+ * division of \p x by \p y. Value \p quo has the same sign as 
+ * \latexonly $ \frac{x}{y} $ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mi>x</m:mi>
+ *     <m:mi>y</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * and may not be the exact quotient but agrees with the exact quotient
+ * in the low order 3 bits.
+ *
+ * \return 
+ * Returns the remainder.
+ * - remquo(\p x,
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points.
+ * - remquo(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y, \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points.
+ * - remquo(\p x, \p y, \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points if either of \p x or \p y is NaN.
+ * - remquo(\p x, 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p quo) returns \p x and stores zero
+ * in the location to which \p quo points for finite \p x.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 remquo(double x, double y, int *quo) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl remquo(double x, double y, int *quo);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Compute single-precision floating-point remainder and part of quotient.
+ *
+ * Compute a single-precision floating-point remainder in the same way as the 
+ * ::remainderf() function. Argument \p quo returns part of quotient upon 
+ * division of \p x by \p y. Value \p quo has the same sign as 
+ * \latexonly $ \frac{x}{y} $ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mi>x</m:mi>
+ *     <m:mi>y</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * and may not be the exact quotient but agrees with the exact quotient
+ * in the low order 3 bits.
+ *
+ * \return 
+ * Returns the remainder.
+ * - remquof(\p x,
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points.
+ * - remquof(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y, \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points.
+ * - remquof(\p x, \p y, \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points if either of \p x or \p y is NaN.
+ * - remquof(\p x, 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p quo) returns \p x and stores zero
+ * in the location to which \p quo points for finite \p x.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  remquof(float x, float y, int *quo) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl remquof(float x, float y, int *quo);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the first kind of order 0 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order 0 for
+ * the input argument \p x, 
+ * \latexonly $J_0(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>J</m:mi>
+ *     <m:mn>0</m:mn>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order 0.
+ * - j0(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ * - j0(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl j0(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the first kind of order 0 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order 0 for
+ * the input argument \p x, 
+ * \latexonly $J_0(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>J</m:mi>
+ *     <m:mn>0</m:mn>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order 0.
+ * - j0f(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ * - j0f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  j0f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the first kind of order 1 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order 1 for
+ * the input argument \p x, 
+ * \latexonly $J_1(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>J</m:mi>
+ *     <m:mn>1</m:mn>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order 1.
+ * - j1(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - j1(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - j1(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl j1(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the first kind of order 1 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order 1 for
+ * the input argument \p x, 
+ * \latexonly $J_1(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>J</m:mi>
+ *     <m:mn>1</m:mn>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order 1.
+ * - j1f(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - j1f(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - j1f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  j1f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the first kind of order n for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order \p n for
+ * the input argument \p x, 
+ * \latexonly $J_n(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>J</m:mi>
+ *     <m:mi>n</m:mi>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order \p n.
+ * - jn(\p n, NaN) returns NaN.
+ * - jn(\p n, \p x) returns NaN for \p n < 0.
+ * - jn(\p n, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl jn(int n, double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the first kind of order n for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order \p n for
+ * the input argument \p x, 
+ * \latexonly $J_n(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>J</m:mi>
+ *     <m:mi>n</m:mi>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order \p n.
+ * - jnf(\p n, NaN) returns NaN.
+ * - jnf(\p n, \p x) returns NaN for \p n < 0.
+ * - jnf(\p n, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  jnf(int n, float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the second kind of order 0 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order 0 for
+ * the input argument \p x, 
+ * \latexonly $Y_0(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>Y</m:mi>
+ *     <m:mn>0</m:mn>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order 0.
+ * - y0(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - y0(\p x) returns NaN for \p x < 0.
+ * - y0(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ * - y0(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl y0(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the second kind of order 0 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order 0 for
+ * the input argument \p x, 
+ * \latexonly $Y_0(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>Y</m:mi>
+ *     <m:mn>0</m:mn>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order 0.
+ * - y0f(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - y0f(\p x) returns NaN for \p x < 0.
+ * - y0f(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ * - y0f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  y0f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the second kind of order 1 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order 1 for
+ * the input argument \p x, 
+ * \latexonly $Y_1(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>Y</m:mi>
+ *     <m:mn>1</m:mn>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order 1.
+ * - y1(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - y1(\p x) returns NaN for \p x < 0.
+ * - y1(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ * - y1(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl y1(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the second kind of order 1 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order 1 for
+ * the input argument \p x, 
+ * \latexonly $Y_1(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>Y</m:mi>
+ *     <m:mn>1</m:mn>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order 1.
+ * - y1f(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - y1f(\p x) returns NaN for \p x < 0.
+ * - y1f(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ * - y1f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  y1f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the second kind of order n for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order \p n for
+ * the input argument \p x, 
+ * \latexonly $Y_n(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>Y</m:mi>
+ *     <m:mi>n</m:mi>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order \p n.
+ * - yn(\p n, \p x) returns NaN for \p n < 0.
+ * - yn(\p n, 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - yn(\p n, \p x) returns NaN for \p x < 0.
+ * - yn(\p n, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ * - yn(\p n, NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl yn(int n, double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the second kind of order n for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order \p n for
+ * the input argument \p x, 
+ * \latexonly $Y_n(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>Y</m:mi>
+ *     <m:mi>n</m:mi>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order \p n.
+ * - ynf(\p n, \p x) returns NaN for \p n < 0.
+ * - ynf(\p n, 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - ynf(\p n, \p x) returns NaN for \p x < 0.
+ * - ynf(\p n, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ * - ynf(\p n, NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  ynf(int n, float x) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the regular modified cylindrical Bessel function of order 0 for the input argument.
+ *
+ * Calculate the value of the regular modified cylindrical Bessel function of order 0 for
+ * the input argument \p x, 
+ * \latexonly $I_0(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>I</m:mi>
+ *     <m:mn>0</m:mn>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return
+ * Returns the value of the regular modified cylindrical Bessel function of order 0.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl cyl_bessel_i0(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the regular modified cylindrical Bessel function of order 0 for the input argument.
+ *
+ * Calculate the value of the regular modified cylindrical Bessel function of order 0 for
+ * the input argument \p x, 
+ * \latexonly $I_0(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>I</m:mi>
+ *     <m:mn>0</m:mn>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return
+ * Returns the value of the regular modified cylindrical Bessel function of order 0.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float                  cyl_bessel_i0f(float x) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the regular modified cylindrical Bessel function of order 1 for the input argument.
+ *
+ * Calculate the value of the regular modified cylindrical Bessel function of order 1 for
+ * the input argument \p x, 
+ * \latexonly $I_1(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>I</m:mi>
+ *     <m:mn>1</m:mn>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return
+ * Returns the value of the regular modified cylindrical Bessel function of order 1.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl cyl_bessel_i1(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the regular modified cylindrical Bessel function of order 1 for the input argument.
+ *
+ * Calculate the value of the regular modified cylindrical Bessel function of order 1 for
+ * the input argument \p x, 
+ * \latexonly $I_1(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mi>I</m:mi>
+ *     <m:mn>1</m:mn>
+ *   </m:msub>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return
+ * Returns the value of the regular modified cylindrical Bessel function of order 1.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float                  cyl_bessel_i1f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the error function of the input argument.
+ *
+ * Calculate the value of the error function for the input argument \p x,
+ * \latexonly $\frac{2}{\sqrt \pi} \int_0^x e^{-t^2} dt$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>2</m:mn>
+ *     <m:msqrt>
+ *       <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ *     </m:msqrt>
+ *   </m:mfrac>
+ *   <m:msubsup>
+ *     <m:mo>&#x222B;<!-- &Int --></m:mo>
+ *     <m:mn>0</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:msubsup>
+ *   <m:msup>
+ *     <m:mi>e</m:mi>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *       <m:msup>
+ *         <m:mi>t</m:mi>
+ *         <m:mn>2</m:mn>
+ *       </m:msup>
+ *     </m:mrow>
+ *   </m:msup>
+ *   <m:mi>d</m:mi>
+ *   <m:mi>t</m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return 
+ * - erf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - erf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 1$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>1</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 erf(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl erf(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the error function of the input argument.
+ *
+ * Calculate the value of the error function for the input argument \p x,
+ * \latexonly $\frac{2}{\sqrt \pi} \int_0^x e^{-t^2} dt$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>2</m:mn>
+ *     <m:msqrt>
+ *       <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ *     </m:msqrt>
+ *   </m:mfrac>
+ *   <m:msubsup>
+ *     <m:mo>&#x222B;<!-- &Int --></m:mo>
+ *     <m:mn>0</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:msubsup>
+ *   <m:msup>
+ *     <m:mi>e</m:mi>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *       <m:msup>
+ *         <m:mi>t</m:mi>
+ *         <m:mn>2</m:mn>
+ *       </m:msup>
+ *     </m:mrow>
+ *   </m:msup>
+ *   <m:mi>d</m:mi>
+ *   <m:mi>t</m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return  
+ * - erff(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - erff(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 1$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>1</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  erff(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl erff(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the inverse error function of the input argument.
+ *
+ * Calculate the inverse error function
+ * \latexonly $\mathrm{erf}^{-1}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>erf</m:mi>
+ *     <m:mrow>
+ *       <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *       <m:mn>1</m:mn>
+ *     </m:mrow>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * (\p x), of the input argument \p x in the interval [-1, 1].
+ *
+ * \return
+ * - erfinv(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - erfinv(1) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - erfinv(-1) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - erfinv(\p x) returns NaN for \p x outside [-1, +1].
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 erfinv(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the inverse error function of the input argument.
+ *
+ * Calculate the inverse error function
+ * \latexonly $\mathrm{erf}^{-1}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>erf</m:mi>
+ *     <m:mrow>
+ *       <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *       <m:mn>1</m:mn>
+ *     </m:mrow>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * (\p x), of the input argument \p x in the interval [-1, 1].
+ *
+ * \return 
+ * - erfinvf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - erfinvf(1) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - erfinvf(-1) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - erfinvf(\p x) returns NaN for \p x outside [-1, +1].
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  erfinvf(float x);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the complementary error function of the input argument.
+ *
+ * Calculate the complementary error function of the input argument \p x,
+ * 1 - erf(\p x).
+ *
+ * \return 
+ * - erfc(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 2.
+ * - erfc(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 erfc(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl erfc(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the complementary error function of the input argument.
+ *
+ * Calculate the complementary error function of the input argument \p x,
+ * 1 - erf(\p x).
+ *
+ * \return 
+ * - erfcf(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 2.
+ * - erfcf(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  erfcf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl erfcf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the natural logarithm of the absolute value of the gamma function of the input argument.
+ *
+ * Calculate the natural logarithm of the absolute value of the gamma function of the input argument \p x, namely the value of
+ * \latexonly $\log_{e}\left|\int_{0}^{\infty} e^{-t}t^{x-1}dt\right|$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mo>log</m:mo>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mi>e</m:mi>
+ *     </m:mrow>
+ *   </m:msub>
+ *   <m:mfenced open="|" close="|">
+ *     <m:mrow>
+ *       <m:msubsup>
+ *         <m:mo>&#x222B;<!-- &Int --></m:mo>
+ *         <m:mrow class="MJX-TeXAtom-ORD">
+ *           <m:mn>0</m:mn>
+ *         </m:mrow>
+ *         <m:mrow class="MJX-TeXAtom-ORD">
+ *           <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ *         </m:mrow>
+ *       </m:msubsup>
+ *       <m:msup>
+ *         <m:mi>e</m:mi>
+ *         <m:mrow class="MJX-TeXAtom-ORD">
+ *           <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *           <m:mi>t</m:mi>
+ *         </m:mrow>
+ *       </m:msup>
+ *       <m:msup>
+ *         <m:mi>t</m:mi>
+ *         <m:mrow class="MJX-TeXAtom-ORD">
+ *           <m:mi>x</m:mi>
+ *           <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *           <m:mn>1</m:mn>
+ *         </m:mrow>
+ *       </m:msup>
+ *       <m:mi>d</m:mi>
+ *       <m:mi>t</m:mi>
+ *     </m:mrow>
+ *   </m:mfenced>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *
+ * \return 
+ * - lgamma(1) returns +0.
+ * - lgamma(2) returns +0.
+ * - lgamma(\p x) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  if \p x 
+ * \latexonly $\leq$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2264;<!-- &Le --></m:mo>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly 0 and \p x is an integer.
+ * - lgamma(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - lgamma(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 lgamma(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl lgamma(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the inverse complementary error function of the input argument.
+ *
+ * Calculate the inverse complementary error function
+ * \latexonly $\textrm{erfc}^{-1}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>erfc</m:mi>
+ *     <m:mrow>
+ *       <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *       <m:mn>1</m:mn>
+ *     </m:mrow>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * (\p x), of the input argument \p x in the interval [0, 2].
+ *
+ * \return 
+ * - erfcinv(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - erfcinv(2) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - erfcinv(\p x) returns NaN for \p x outside [0, 2].
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 erfcinv(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the inverse complementary error function of the input argument.
+ *
+ * Calculate the inverse complementary error function
+ * \latexonly $\textrm{erfc}^{-1}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>erfc</m:mi>
+ *     <m:mrow>
+ *       <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *       <m:mn>1</m:mn>
+ *     </m:mrow>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * (\p x), of the input argument \p x in the interval [0, 2].
+ *
+ * \return 
+ * - erfcinvf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - erfcinvf(2) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - erfcinvf(\p x) returns NaN for \p x outside [0, 2].
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  erfcinvf(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the inverse of the standard normal cumulative distribution function.
+ *
+ * Calculate the inverse of the standard normal cumulative distribution function for input argument \p x,
+ * \latexonly $\Phi^{-1}(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi mathvariant="normal">&#x03A6;<!-- &Phi --></m:mi>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *       <m:mn>1</m:mn>
+ *     </m:mrow>
+ *   </m:msup>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. The function is defined for input values in the interval 
+ * \latexonly $(0, 1)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mn>0</m:mn>
+ *   <m:mo>,</m:mo>
+ *   <m:mn>1</m:mn>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return 
+ * - normcdfinv(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - normcdfinv(1) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - normcdfinv(\p x) returns NaN
+ *  if \p x is not in the interval [0,1].
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 normcdfinv(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the inverse of the standard normal cumulative distribution function.
+ *
+ * Calculate the inverse of the standard normal cumulative distribution function for input argument \p x,
+ * \latexonly $\Phi^{-1}(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi mathvariant="normal">&#x03A6;<!-- &Phi --></m:mi>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *       <m:mn>1</m:mn>
+ *     </m:mrow>
+ *   </m:msup>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly. The function is defined for input values in the interval 
+ * \latexonly $(0, 1)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mn>0</m:mn>
+ *   <m:mo>,</m:mo>
+ *   <m:mn>1</m:mn>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return 
+ * - normcdfinvf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - normcdfinvf(1) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - normcdfinvf(\p x) returns NaN
+ *  if \p x is not in the interval [0,1].
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  normcdfinvf(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the standard normal cumulative distribution function.
+ *
+ * Calculate the cumulative distribution function of the standard normal distribution for input argument \p x,
+ * \latexonly $\Phi(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi mathvariant="normal">&#x03A6;<!-- &Phi --></m:mi>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return 
+ * - normcdf(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1.
+ * - normcdf(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> 
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 normcdf(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the standard normal cumulative distribution function.
+ *
+ * Calculate the cumulative distribution function of the standard normal distribution for input argument \p x,
+ * \latexonly $\Phi(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi mathvariant="normal">&#x03A6;<!-- &Phi --></m:mi>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return 
+ * - normcdff(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1.
+ * - normcdff(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> 
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  normcdff(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the scaled complementary error function of the input argument.
+ *
+ * Calculate the scaled complementary error function of the input argument \p x,
+ * \latexonly $e^{x^2}\cdot \textrm{erfc}(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>e</m:mi>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:msup>
+ *         <m:mi>x</m:mi>
+ *         <m:mn>2</m:mn>
+ *       </m:msup>
+ *     </m:mrow>
+ *   </m:msup>
+ *   <m:mo>&#x22C5;<!-- &Sdot --></m:mo>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mtext>erfc</m:mtext>
+ *   </m:mrow>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return 
+ * - erfcx(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - erfcx(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> 
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 erfcx(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the scaled complementary error function of the input argument.
+ *
+ * Calculate the scaled complementary error function of the input argument \p x,
+ * \latexonly $e^{x^2}\cdot \textrm{erfc}(x)$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>e</m:mi>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:msup>
+ *         <m:mi>x</m:mi>
+ *         <m:mn>2</m:mn>
+ *       </m:msup>
+ *     </m:mrow>
+ *   </m:msup>
+ *   <m:mo>&#x22C5;<!-- &Sdot --></m:mo>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mtext>erfc</m:mtext>
+ *   </m:mrow>
+ *   <m:mo stretchy="false">(</m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo stretchy="false">)</m:mo>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return 
+ * - erfcxf(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - erfcxf(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> 
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  erfcxf(float x);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the natural logarithm of the absolute value of the gamma function of the input argument.
+ *
+ * Calculate the natural logarithm of the absolute value of the gamma function of the input argument \p x, namely the value of
+ * \latexonly $\log_{e}\left|\int_{0}^{\infty} e^{-t}t^{x-1}dt\right|$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msub>
+ *     <m:mo>log</m:mo>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mi>e</m:mi>
+ *     </m:mrow>
+ *   </m:msub>
+ *   <m:mfenced open="|" close="|">
+ *     <m:mrow>
+ *       <m:msubsup>
+ *         <m:mo>&#x222B;<!-- &Int --></m:mo>
+ *         <m:mrow class="MJX-TeXAtom-ORD">
+ *           <m:mn>0</m:mn>
+ *         </m:mrow>
+ *         <m:mrow class="MJX-TeXAtom-ORD">
+ *           <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ *         </m:mrow>
+ *       </m:msubsup>
+ *       <m:msup>
+ *         <m:mi>e</m:mi>
+ *         <m:mrow class="MJX-TeXAtom-ORD">
+ *           <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *           <m:mi>t</m:mi>
+ *         </m:mrow>
+ *       </m:msup>
+ *       <m:msup>
+ *         <m:mi>t</m:mi>
+ *         <m:mrow class="MJX-TeXAtom-ORD">
+ *           <m:mi>x</m:mi>
+ *           <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *           <m:mn>1</m:mn>
+ *         </m:mrow>
+ *       </m:msup>
+ *       <m:mi>d</m:mi>
+ *       <m:mi>t</m:mi>
+ *     </m:mrow>
+ *   </m:mfenced>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *
+ * \return 
+ * - lgammaf(1) returns +0.
+ * - lgammaf(2) returns +0.
+ * - lgammaf(\p x) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  if \p x
+ * \latexonly $\leq$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2264;<!-- &Le --></m:mo>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  0 and \p x is an integer.
+ * - lgammaf(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - lgammaf(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  lgammaf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl lgammaf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the gamma function of the input argument.
+ *
+ * Calculate the gamma function of the input argument \p x, namely the value of
+ * \latexonly $\int_{0}^{\infty} e^{-t}t^{x-1}dt$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msubsup>
+ *     <m:mo>&#x222B;<!-- &Int --></m:mo>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mn>0</m:mn>
+ *     </m:mrow>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ *     </m:mrow>
+ *   </m:msubsup>
+ *   <m:msup>
+ *     <m:mi>e</m:mi>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *       <m:mi>t</m:mi>
+ *     </m:mrow>
+ *   </m:msup>
+ *   <m:msup>
+ *     <m:mi>t</m:mi>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mi>x</m:mi>
+ *       <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *       <m:mn>1</m:mn>
+ *     </m:mrow>
+ *   </m:msup>
+ *   <m:mi>d</m:mi>
+ *   <m:mi>t</m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return 
+ * - tgamma(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - tgamma(2) returns +1.
+ * - tgamma(\p x) returns NaN if \p x < 0 and \p x is an integer.
+ * - tgamma(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN.
+ * - tgamma(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 tgamma(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl tgamma(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the gamma function of the input argument.
+ *
+ * Calculate the gamma function of the input argument \p x, namely the value of
+ * \latexonly $\int_{0}^{\infty} e^{-t}t^{x-1}dt$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msubsup>
+ *     <m:mo>&#x222B;<!-- &Int --></m:mo>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mn>0</m:mn>
+ *     </m:mrow>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ *     </m:mrow>
+ *   </m:msubsup>
+ *   <m:msup>
+ *     <m:mi>e</m:mi>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *       <m:mi>t</m:mi>
+ *     </m:mrow>
+ *   </m:msup>
+ *   <m:msup>
+ *     <m:mi>t</m:mi>
+ *     <m:mrow class="MJX-TeXAtom-ORD">
+ *       <m:mi>x</m:mi>
+ *       <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *       <m:mn>1</m:mn>
+ *     </m:mrow>
+ *   </m:msup>
+ *   <m:mi>d</m:mi>
+ *   <m:mi>t</m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return 
+ * - tgammaf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - tgammaf(2) returns +1.
+ * - tgammaf(\p x) returns NaN if \p x < 0 and \p x is an integer.
+ * - tgammaf(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN.
+ * - tgammaf(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  tgammaf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl tgammaf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/** \ingroup CUDA_MATH_DOUBLE
+ * \brief Create value with given magnitude, copying sign of second value.
+ *
+ * Create a floating-point value with the magnitude \p x and the sign of \p y.
+ *
+ * \return
+ * Returns a value with the magnitude of \p x and the sign of \p y.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 copysign(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl copysign(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/** \ingroup CUDA_MATH_SINGLE
+ * \brief Create value with given magnitude, copying sign of second value.
+ *
+ * Create a floating-point value with the magnitude \p x and the sign of \p y.
+ *
+ * \return
+ * Returns a value with the magnitude of \p x and the sign of \p y.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  copysignf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl copysignf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Return next representable double-precision floating-point value after argument \p x in the direction of \p y.
+ *
+ * Calculate the next representable double-precision floating-point value
+ * following \p x in the direction of \p y. For example, if \p y is greater than \p x, ::nextafter()
+ * returns the smallest representable number greater than \p x.
+ *
+ * \return 
+ * - nextafter(\p x, \p y) = \p y if \p x equals \p y.
+ * - nextafter(\p x, \p y) = NaN if either \p x or \p y is NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 nextafter(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl nextafter(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Return next representable single-precision floating-point value after argument \p x in the direction of \p y.
+ *
+ * Calculate the next representable single-precision floating-point value
+ * following \p x in the direction of \p y. For example, if \p y is greater than \p x, ::nextafterf()
+ * returns the smallest representable number greater than \p x.
+ *
+ * \return 
+ * - nextafterf(\p x, \p y) = \p y if \p x equals \p y.
+ * - nextafterf(\p x, \p y) = NaN if either \p x or \p y is NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  nextafterf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl nextafterf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Returns a "Not a Number" (NaN) value.
+ *
+ * Return a representation of a quiet NaN. Argument \p tagp selects one of the possible representations.
+ *
+ * \return
+ * - nan(\p tagp) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 nan(const char *tagp) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl nan(const char *tagp);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Returns a "Not a Number" (NaN) value.
+ *
+ * Return a representation of a quiet NaN. Argument \p tagp selects one of the possible representations.
+ *
+ * \return
+ * - nanf(\p tagp) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  nanf(const char *tagp) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl nanf(const char *tagp);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* namespace std */
+#endif /* __QNX__ && !_LIBCPP_VERSION */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isinff(float) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isnanf(float) __THROW;
+/* The int-returning helper declarations below use different names depending on
+   whether __APPLE__ is defined. */
+#if defined(__APPLE__)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isfinited(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isfinitef(float) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __signbitd(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isnand(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isinfd(double) __THROW;
+#else /* __APPLE__ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __finite(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __finitef(float) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __signbit(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isnan(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isinf(double) __THROW;
+#endif /* __APPLE__ */
+
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __signbitf(float) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif /* __QNX__ && !_LIBCPP_VERSION */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ *
+ * Compute the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation. After computing the value
+ * to infinite precision, the value is rounded once.
+ *
+ * \return
+ * Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - fma(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - fma(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - fma(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if 
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - fma(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if 
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 fma(double x, double y, double z) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl fma(double x, double y, double z);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ *
+ * Compute the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation. After computing the value
+ * to infinite precision, the value is rounded once.
+ *
+ * \return
+ * Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - fmaf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - fmaf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - fmaf(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if 
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - fmaf(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if 
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Times --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fmaf(float x, float y, float z) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl fmaf(float x, float y, float z);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* namespace std */
+#endif
+
+
+/* The long double declarations below are here to avoid warnings on the call graph;
+   long double is not supported on the device. */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __signbitl(long double) __THROW;
+#if defined(__APPLE__)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isfinite(long double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isinf(long double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isnan(long double) __THROW;
+#else /* __APPLE__ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __finitel(long double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isinfl(long double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isnanl(long double) __THROW;
+#endif /* __APPLE__ */
+
+#if defined(_WIN32) && defined(_M_AMD64)
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl acosf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl asinf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl atanf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl atan2f(float, float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl cosf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl sinf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl tanf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl coshf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl sinhf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl tanhf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl expf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl logf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl log10f(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl modff(float, float*) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl powf(float, float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl sqrtf(float) __THROW;         
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl ceilf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl floorf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl fmodf(float, float) __THROW;
+#else /* _WIN32 && _M_AMD64 */
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the arc cosine of the input argument.
+ *
+ * Calculate the principal value of the arc cosine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [0, 
+ * \latexonly $\pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ] for \p x inside [-1, +1].
+ * - acosf(1) returns +0.
+ * - acosf(\p x) returns NaN for \p x outside [-1, +1].
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  acosf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the arc sine of the input argument.
+ *
+ * Calculate the principal value of the arc sine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \latexonly $\pi/2$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo>/</m:mo>
+ *   </m:mrow>
+ *   <m:mn>2</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , +
+ * \latexonly $\pi/2$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo>/</m:mo>
+ *   </m:mrow>
+ *   <m:mn>2</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ] for \p x inside [-1, +1].
+ * - asinf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - asinf(\p x) returns NaN for \p x outside [-1, +1].
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  asinf(float x) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the arc tangent of the input argument.
+ *
+ * Calculate the principal value of the arc tangent of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \latexonly $\pi/2$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo>/</m:mo>
+ *   </m:mrow>
+ *   <m:mn>2</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , +
+ * \latexonly $\pi/2$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo>/</m:mo>
+ *   </m:mrow>
+ *   <m:mn>2</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ].
+ * - atanf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - atanf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pm \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * /2.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  atanf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the arc tangent of the ratio of first and second input arguments.
+ *
+ * Calculate the principal value of the arc tangent of the ratio of first
+ * and second input arguments \p y / \p x. The quadrant of the result is 
+ * determined by the signs of inputs \p y and \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \latexonly $\pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , +
+ * \latexonly $\pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ].
+ * - atan2f(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , -0) returns
+ * \latexonly $\pm \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - atan2f(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , +0) returns
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ * - atan2f(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p x) returns
+ * \latexonly $\pm \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * for \p x < 0.
+ * - atan2f(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p x) returns
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * for \p x > 0.
+ * - atan2f(\p y,
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $-\pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * /2 for \p y < 0.
+ * - atan2f(\p y,
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * /2 for \p y > 0.
+ * - atan2f(
+ * \latexonly $\pm y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ,
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns
+ * \latexonly $\pm \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * for finite \p y > 0.
+ * - atan2f(
+ * \latexonly $\pm y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ,
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * for finite \p y > 0.
+ * - atan2f(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p x) returns
+ * \latexonly $\pm \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * /2 for finite \p x.
+ * - atan2f(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ,
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns
+ * \latexonly $\pm 3\pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>3</m:mn>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * /4.
+ * - atan2f(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ,
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns
+ * \latexonly $\pm \pi$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>&#x03C0;<!-- &Pi --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * /4.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  atan2f(float y, float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the cosine of the input argument.
+ *
+ * Calculate the cosine of the input argument \p x (measured in radians).
+ *
+ * \return 
+ * - cosf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1.
+ * - cosf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  cosf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the sine of the input argument.
+ *
+ * Calculate the sine of the input argument \p x (measured in radians).
+ *
+ * \return 
+ * - sinf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - sinf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  sinf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the tangent of the input argument.
+ *
+ * Calculate the tangent of the input argument \p x (measured in radians).
+ *
+ * \return 
+ * - tanf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - tanf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  tanf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the hyperbolic cosine of the input argument.
+ *
+ * Calculate the hyperbolic cosine of the input argument \p x.
+ *
+ * \return 
+ * - coshf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1.
+ * - coshf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  coshf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the hyperbolic sine of the input argument.
+ *
+ * Calculate the hyperbolic sine of the input argument \p x.
+ *
+ * \return 
+ * - sinhf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - sinhf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  sinhf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the hyperbolic tangent of the input argument.
+ *
+ * Calculate the hyperbolic tangent of the input argument \p x.
+ *
+ * \return 
+ * - tanhf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - tanhf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns
+ * \latexonly $\pm 1$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>1</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  tanhf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the natural logarithm of the input argument.
+ *
+ * Calculate the natural logarithm of the input argument \p x.
+ *
+ * \return 
+ * - logf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - logf(1) returns +0.
+ * - logf(\p x) returns NaN for \p x < 0.
+ * - logf(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  logf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 
+ * \latexonly $e$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>e</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  exponential of the input argument.
+ *
+ * Calculate
+ * \latexonly $e^x$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msup>
+ *     <m:mi>e</m:mi>
+ *     <m:mi>x</m:mi>
+ *   </m:msup>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly,
+ * the base 
+ * \latexonly $e$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>e</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  exponential of the input argument \p x.
+ *
+ * \return
+ * - expf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1.
+ * - expf(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns +0.
+ * - expf(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly
+ * ) returns
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  expf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 10 logarithm of the input argument.
+ *
+ * Calculate the base 10 logarithm of the input argument \p x.
+ *
+ * \return 
+ * - log10f(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - log10f(1) returns +0.
+ * - log10f(\p x) returns NaN for \p x < 0.
+ * - log10f(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  log10f(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Break down the input argument into fractional and integral parts.
+ *
+ * Break down the argument \p x into fractional and integral parts. The integral part is stored in the object pointed to by \p iptr.
+ * Fractional and integral parts are given the same sign as the argument \p x.
+ *
+ * \return 
+ * - modff(
+ * \latexonly $\pm x$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *  <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi>x</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p iptr) returns a result with the same sign as \p x.
+ * - modff(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p iptr) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  and stores 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *   in the object pointed to by \p iptr.
+ * - modff(NaN, \p iptr) stores a NaN in the object pointed to by \p iptr and returns a NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  modff(float x, float *iptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the first argument raised to the power of the second argument.
+ *
+ * Calculate the value of \p x to the power of \p y.
+ *
+ * \return 
+ * - powf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  for \p y an odd integer less than 0.
+ * - powf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  for \p y less than 0 and not an odd integer.
+ * - powf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  for \p y an odd integer greater than 0.
+ * - powf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns +0 for \p y > 0 and not an odd integer.
+ * - powf(-1, 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1.
+ * - powf(+1, \p y) returns 1 for any \p y, even a NaN.
+ * - powf(\p x, 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 1 for any \p x, even a NaN.
+ * - powf(\p x, \p y) returns a NaN for finite \p x < 0 and finite non-integer \p y.
+ * - powf(\p x, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  for 
+ * \latexonly $| x | < 1$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mi>x</m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mo>&lt;</m:mo>
+ *   <m:mn>1</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - powf(\p x, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0 for 
+ * \latexonly $| x | > 1$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mi>x</m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mo>&gt;</m:mo>
+ *   <m:mn>1</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - powf(\p x, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns +0 for 
+ * \latexonly $| x | < 1$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mi>x</m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mo>&lt;</m:mo>
+ *   <m:mn>1</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - powf(\p x, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  for 
+ * \latexonly $| x | > 1$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mi>x</m:mi>
+ *   <m:mrow class="MJX-TeXAtom-ORD">
+ *     <m:mo stretchy="false">|</m:mo>
+ *   </m:mrow>
+ *   <m:mo>&gt;</m:mo>
+ *   <m:mn>1</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - powf(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns -0 for \p y an odd integer less than 0.
+ * - powf(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns +0 for \p y < 0 and not an odd integer.
+ * - powf(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  for \p y an odd integer greater than 0.
+ * - powf(
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x2212;<!-- &Minus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  for \p y > 0 and not an odd integer.
+ * - powf(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns +0 for \p y < 0.
+ * - powf(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  for \p y > 0.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  powf(float x, float y) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the square root of the input argument.
+ *
+ * Calculate the nonnegative square root of \p x, 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \return 
+ * Returns 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - sqrtf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - sqrtf(
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - sqrtf(\p x) returns NaN if \p x is less than 0.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  sqrtf(float x) __THROW;         
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the ceiling of the input argument.
+ *
+ * Compute the smallest integer value not less than \p x.
+ *
+ * \return
+ * Returns 
+ * \latexonly $\lceil x \rceil$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo fence="false" stretchy="false">&#x2308;<!-- &Lceil --></m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo fence="false" stretchy="false">&#x2309;<!-- &Rceil --></m:mo>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  expressed as a floating-point number.
+ * - ceilf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - ceilf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  ceilf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the largest integer less than or equal to \p x.
+ * 
+ * Calculate the largest integer value which is less than or equal to \p x.
+ * 
+ * \return
+ * Returns 
+ * \latexonly $\lfloor x \rfloor$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo fence="false" stretchy="false">&#x230A;<!-- &Lfloor --></m:mo>
+ *   <m:mi>x</m:mi>
+ *   <m:mo fence="false" stretchy="false">&#x230B;<!-- &Rfloor --></m:mo>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  expressed as a floating-point number.
+ * - floorf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ * - floorf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  floorf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the floating-point remainder of \p x / \p y.
+ *
+ * Calculate the floating-point remainder of \p x / \p y.
+ * The floating-point remainder of the division operation \p x / \p y calculated
+ * by this function is exactly the value <tt>x - n*y</tt>, where \p n is \p x / \p y with its fractional part truncated.
+ * The computed value will have the same sign as \p x, and its magnitude will be less than the magnitude of \p y.
+ *
+ * \return
+ * - Returns the floating-point remainder of \p x / \p y.
+ * - fmodf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p y) returns 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  if \p y is not zero.
+ * - fmodf(\p x, 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns \p x if \p x is finite.
+ * - fmodf(\p x, \p y) returns NaN if \p x is 
+ * \latexonly $\pm\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus --></m:mo>
+ *   <m:mi mathvariant="normal">&#x221E;<!-- &Infinity --></m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  or \p y is zero.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fmodf(float x, float y) __THROW;
+#if defined(__QNX__)
+/*
+ * QNX only: redeclare some builtins that QNX uses, tagged with the same
+ * __DEVICE_FUNCTIONS_DECL__ __device_builtin__ qualifiers as the standard
+ * math prototypes declared earlier in this header.
+ * Parameters are intentionally unnamed. Presumably these are internal
+ * helpers of the QNX C library's math implementation -- TODO confirm
+ * their exact contracts against the QNX headers.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float _FLog(float, int);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float _FCosh(float, float);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float _FSinh(float, float);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float _FSinx(float, unsigned int, int);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int _FDsign(float);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int _Dsign(double);
+#endif /* __QNX__ */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+#endif /* _WIN32 && _M_AMD64 */
+
+}
+
+#if !defined(__CUDACC_RTC__)
+#include <math.h>
+#include <stdlib.h>
+
+#ifndef __CUDA_INTERNAL_SKIP_CPP_HEADERS__
+#include <cmath>
+#include <cstdlib>
+#endif /* __CUDA_INTERNAL_SKIP_CPP_HEADERS__ */
+#endif /* __CUDACC_RTC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDACC_RTC__)
+
+/*
+ * __CUDACC_RTC__ builds: this header includes <math.h>/<cmath> only when
+ * __CUDACC_RTC__ is NOT defined (see the guarded includes above), so the
+ * floating-point classification functions are declared here directly.
+ * Each function gets float, double and long double overloads returning int.
+ */
+
+/* signbit overloads. */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(long double x);
+
+/* isfinite overloads. */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(long double x);
+
+/* isnan overloads. */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(long double x);
+
+/* isinf overloads. */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(long double x);
+
+#elif defined(__GNUC__)
+
+#undef signbit
+#undef isfinite
+#undef isnan
+#undef isinf
+
+#if defined(__APPLE__)
+
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(float x);
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(double x);
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(long double x);
+
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(float x); 
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(double x);
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(long double x);
+
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(double x) throw();
+#if !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(float x);
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(long double x);
+#else /* !(!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000) */
+template <typename T>
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool __libcpp_isnan(T) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isnan(float x) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY  __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isnan(long double x) _NOEXCEPT;
+#endif /* !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000 */
+
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(double x) throw();
+#if !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(float x);
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(long double x);
+#else /* !(!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000) */
+template <typename T>
+__cudart_builtin__ __DEVICE_FUNCTIONS_DECL__ bool __libcpp_isinf(T) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isinf(float x) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isinf(long double x) _NOEXCEPT;
+#endif /* !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000 */
+
+#else /* __APPLE__ */
+
+#if ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)
+#if !defined(_NVHPC_CUDA)
+namespace std {
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool signbit(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool signbit(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool signbit(long double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isfinite(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isfinite(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isfinite(long double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isnan(float x);
+/* GCC 6.1 uses ::isnan(double x) for isnan(double x) if the condition is true */
+#if _GLIBCXX_HAVE_OBSOLETE_ISNAN && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(double x) throw();
+#else /* !(_GLIBCXX_HAVE_OBSOLETE_ISNAN && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC) */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isnan(double x);
+#endif /* _GLIBCXX_HAVE_OBSOLETE_ISNAN && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isnan(long double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isinf(float x);
+/* GCC 6.1 uses ::isinf(double x) for isinf(double x) if the condition is true. */
+#if _GLIBCXX_HAVE_OBSOLETE_ISINF && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(double x) throw();
+#else /* !(_GLIBCXX_HAVE_OBSOLETE_ISINF && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC) */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isinf(double x);
+#endif /* _GLIBCXX_HAVE_OBSOLETE_ISINF && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isinf(long double x);
+}
+#endif
+
+#else /* !(((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)) */
+
+#if defined(__QNX__)
+#if (__QNX__) && !defined(_LIBCPP_VERSION)
+/* QNX defines functions in std, need to declare them here */
+namespace std {
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool signbit(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool signbit(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool signbit(long double x);
+}
+#else
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool signbit(const float x);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool signbit(const double x);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool signbit(const long double x);
+#endif
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isfinite(const float a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isfinite(const double a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isfinite(const long double a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isnan(const float a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isnan(const double a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isnan(const long double a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isinf(const float a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isinf(const double a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isinf(const long double a);
+#else /* ! __QNX__ */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(const float x);
+#if defined(__ICC)
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(const double x) throw();
+#else /* !__ICC */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(const double x);
+#endif /* __ICC */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(const long double x);
+
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(const float x);
+#if defined(__ICC)
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(const double x) throw();
+#else /* !__ICC */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(const double x);
+#endif /* __ICC */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(const long double x);
+
+#if (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000
+template <typename T>
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool __libcpp_isnan(T) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isnan(float x) _NOEXCEPT;
+#else /* !((defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(float x);
+#endif /* (defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000 */
+#if defined(__ANDROID__) || defined(__HORIZON__)
+#if !defined(_LIBCPP_VERSION)
+__forceinline__
+#endif  /* !defined(_LIBCPP_VERSION) */
+#if _LIBCPP_VERSION >= 7000
+#ifdef _LIBCPP_PREFERRED_OVERLOAD
+_LIBCPP_INLINE_VISIBILITY _LIBCPP_PREFERRED_OVERLOAD __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isnan(double x) _NOEXCEPT;
+#endif /* _LIBCPP_PREFERRED_OVERLOAD */
+#else /* _LIBCPP_VERSION < 7000 */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(double x);
+#endif /* _LIBCPP_VERSION >= 7000 */
+#else /* !(__ANDROID__ || __HORIZON__) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(double x) throw();
+#endif /* __ANDROID__ || __HORIZON__ */
+#if (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000
+inline _LIBCPP_INLINE_VISIBILITY  __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isnan(long double x) _NOEXCEPT;
+#else /* !( (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(long double x);
+#endif /* (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000 */
+
+#if (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000
+static __inline__ __cudart_builtin__ __DEVICE_FUNCTIONS_DECL__ unsigned __FLOAT_BITS(float __f);
+static __inline__ __cudart_builtin__ __DEVICE_FUNCTIONS_DECL__ unsigned long long __DOUBLE_BITS(double __f);
+template <typename T>
+__cudart_builtin__ __DEVICE_FUNCTIONS_DECL__ bool __libcpp_isinf(T) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isinf(float x) _NOEXCEPT;
+#else /* !( (defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(float x);
+#endif /* (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000 */
+
+#if defined(__ANDROID__) || defined(__HORIZON__)
+#if !defined(_LIBCPP_VERSION)
+__forceinline__
+#endif  /* !defined(_LIBCPP_VERSION) */
+#if _LIBCPP_VERSION >= 7000
+#ifdef _LIBCPP_PREFERRED_OVERLOAD
+_LIBCPP_INLINE_VISIBILITY _LIBCPP_PREFERRED_OVERLOAD __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isinf(double x) _NOEXCEPT;
+#endif /* _LIBCPP_PREFERRED_OVERLOAD */
+#else /* _LIBCPP_VERSION < 7000 */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(double x);
+#endif /* _LIBCPP_VERSION >= 7000 */
+#else /* ! (__ANDROID__  || __HORIZON__) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(double x) throw();
+#endif /* __ANDROID__ || __HORIZON__ */
+#if (defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isinf(long double x) _NOEXCEPT;
+#else /* !( (defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(long double x);
+#endif  /* (defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000 */
+#endif /* __QNX__  */
+
+#endif /* ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L) */
+#endif /* __APPLE__ */
+
+#if !defined(_LIBCPP_VERSION) /* define __NV_GLIBCXX_VERSION only when _LIBCPP_VERSION is undefined (i.e. not libc++) */
+#if defined(__clang__)
+#if __has_include(<ext/random>)
+#define __NV_GLIBCXX_VERSION 40800 /* clang with <ext/random> available: use 40800, the threshold the version checks in this file compare against */
+#endif /* __has_include(<ext/random>) */
+#endif /* __clang__ */
+
+#if !defined(__NV_GLIBCXX_VERSION)
+#define __NV_GLIBCXX_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) /* fallback: encode the GCC version as major*10000 + minor*100 + patchlevel */
+#endif /* !__NV_GLIBCXX_VERSION */
+#endif /* !defined(_LIBCPP_VERSION) */
+
+#if !defined(__HORIZON__) || !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 3800
+#if defined(__arm__) && !defined(_STLPORT_VERSION) && !_GLIBCXX_USE_C99
+#if !defined(__ANDROID__) || (defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION < 40800)
+
+#if defined(__QNX__)
+/* QNX defines functions in std, need to declare them here */
+namespace std {
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs (long long int a);
+}
+#elif defined(__HORIZON__)
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#pragma GCC system_header
+#endif
+_LIBCPP_BEGIN_NAMESPACE_STD
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs (long long int a) throw();
+_LIBCPP_END_NAMESPACE_STD
+#else
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs(long long int a);
+#endif /* __QNX__ || __HORIZON__*/
+
+#endif /* !__ANDROID__ || (defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION < 40800) */
+#endif /* __arm__ && !_STLPORT_VERSION && !_GLIBCXX_USE_C99 */
+#endif /* !defined(__HORIZON__) || !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 3800 */
+
+#if defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION < 40800 && !defined(__ibmxl__) /* only for __NV_GLIBCXX_VERSION below 40800 and not __ibmxl__ */
+
+#if !defined(_STLPORT_VERSION)
+namespace __gnu_cxx /* without STLport the abs(long long) declaration below lives in __gnu_cxx */
+{
+#endif /* !_STLPORT_VERSION */
+
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs(long long int a); /* with STLport this is declared in the enclosing scope instead */
+
+#if !defined(_STLPORT_VERSION)
+} /* namespace __gnu_cxx */
+#endif /* !_STLPORT_VERSION */
+
+#endif /* defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION < 40800 && !__ibmxl__ */
+
+namespace std
+{
+  template<typename T> extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ T __pow_helper(T, int);
+  template<typename T> extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ T __cmath_power(T, unsigned int);
+}
+
+using std::abs;
+using std::fabs;
+using std::ceil;
+using std::floor;
+using std::sqrt;
+#if !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 3800
+using std::pow;
+#endif /* !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 3800 */
+using std::log;
+using std::log10;
+using std::fmod;
+using std::modf;
+using std::exp;
+using std::frexp;
+using std::ldexp;
+using std::asin;
+using std::sin;
+using std::sinh;
+using std::acos;
+using std::cos;
+using std::cosh;
+using std::atan;
+using std::atan2;
+using std::tan;
+using std::tanh;
+
+#elif defined(_WIN32)
+
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __CUDA_MATH_CRTIMP double __cdecl _hypot(double x, double y);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __CUDA_MATH_CRTIMP float  __cdecl _hypotf(float x, float y);
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __DEVICE_FUNCTIONS_DECL__ int signbit(long double a);
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if _MSC_VER >= 1900
+#define __SIGNBIT_THROW throw()
+#else
+#define __SIGNBIT_THROW
+#endif
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool signbit(long double) __SIGNBIT_THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __device_builtin__ __CUDA_MATH_CRTIMP int _ldsign(long double);
+#undef __SIGNBIT_THROW
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Return the sign bit of the input.
+ *
+ * Determine whether the floating-point value \p a is negative.
+ *
+ * \return
+ * Reports the sign bit of all values including infinities, zeros, and NaNs.
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns 
+ * true if and only if \p a is negative.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a 
+ * nonzero value if and only if \p a is negative. 
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE signbit(double a);
+#undef __RETURN_TYPE 
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#define __RETURN_TYPE bool
+#if _MSC_VER >= 1900
+#define __SIGNBIT_THROW throw()
+#else
+#define __SIGNBIT_THROW
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Return the sign bit of the input.
+ *
+ * Determine whether the floating-point value \p a is negative.
+ *
+ * \return
+ * Reports the sign bit of all values including infinities, zeros, and NaNs.
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns 
+ * true if and only if \p a is negative.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a 
+ * nonzero value if and only if \p a is negative. 
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __RETURN_TYPE signbit(double) __SIGNBIT_THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __device_builtin__ __CUDA_MATH_CRTIMP int _dsign(double);
+#undef __RETURN_TYPE 
+#undef __SIGNBIT_THROW
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * 
+ * \brief Return the sign bit of the input.
+ *
+ * Determine whether the floating-point value \p a is negative.
+ *
+ * \return
+ * Reports the sign bit of all values including infinities, zeros, and NaNs.
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns 
+ * true if and only if \p a is negative.
+ * - With other host compilers: __RETURN_TYPE is 'int'.  Returns a nonzero value 
+ * if and only if \p a is negative.  
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE signbit(float a);
+#undef __RETURN_TYPE
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#define __RETURN_TYPE bool
+#if _MSC_VER >= 1900
+#define __SIGNBIT_THROW throw()
+#else
+#define __SIGNBIT_THROW
+#endif
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * 
+ * \brief Return the sign bit of the input.
+ *
+ * Determine whether the floating-point value \p a is negative.
+ *
+ * \return
+ * Reports the sign bit of all values including infinities, zeros, and NaNs.
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns 
+ * true if and only if \p a is negative.
+ * - With other host compilers: __RETURN_TYPE is 'int'.  Returns a nonzero value 
+ * if and only if \p a is negative.  
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __RETURN_TYPE signbit(float) __SIGNBIT_THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __device_builtin__ __CUDA_MATH_CRTIMP int _fdsign(float);
+#undef __RETURN_TYPE
+#undef __SIGNBIT_THROW
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __DEVICE_FUNCTIONS_DECL__ int isinf(long double a);
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isinf(long double a);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Determine whether argument is infinite.
+ *
+ * Determine whether the floating-point value \p a is an infinite value
+ * (positive or negative).
+ * \return
+ * - With Visual Studio 2013 host compiler: Returns true if and only 
+ * if \p a is an infinite value.
+ * - With other host compilers: Returns a nonzero value if and only 
+ * if \p a is an infinite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isinf(double a);
+#undef __RETURN_TYPE
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Determine whether argument is infinite.
+ *
+ * Determine whether the floating-point value \p a is an infinite value
+ * (positive or negative).
+ * \return
+ * - With Visual Studio 2013 host compiler: Returns true if and only 
+ * if \p a is an infinite value.
+ * - With other host compilers: Returns a nonzero value if and only 
+ * if \p a is an infinite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isinf(double a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * 
+ * \brief Determine whether argument is infinite.
+ *
+ * Determine whether the floating-point value \p a is an infinite value
+ * (positive or negative).
+ *
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns 
+ * true if and only if \p a is an infinite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero 
+ * value if and only if \p a is an infinite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isinf(float a);
+#undef __RETURN_TYPE
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * 
+ * \brief Determine whether argument is infinite.
+ *
+ * Determine whether the floating-point value \p a is an infinite value
+ * (positive or negative).
+ *
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns 
+ * true if and only if \p a is an infinite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero 
+ * value if and only if \p a is an infinite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isinf(float a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __DEVICE_FUNCTIONS_DECL__ int isnan(long double a);
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isnan(long double a);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Determine whether argument is a NaN.
+ *
+ * Determine whether the floating-point value \p a is a NaN.
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. 
+ * Returns true if and only if \p a is a NaN value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a 
+ * nonzero value if and only if \p a is a NaN value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isnan(double a);
+#undef __RETURN_TYPE
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Determine whether argument is a NaN.
+ *
+ * Determine whether the floating-point value \p a is a NaN.
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. 
+ * Returns true if and only if \p a is a NaN value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a 
+ * nonzero value if and only if \p a is a NaN value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isnan(double a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * 
+ * 
+ * \brief Determine whether argument is a NaN.
+ *
+ * Determine whether the floating-point value \p a is a NaN.
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. 
+ * Returns true if and only if \p a is a NaN value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a 
+ * nonzero value if and only if \p a is a NaN value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isnan(float a);
+#undef __RETURN_TYPE
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * 
+ * 
+ * \brief Determine whether argument is a NaN.
+ *
+ * Determine whether the floating-point value \p a is a NaN.
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. 
+ * Returns true if and only if \p a is a NaN value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a 
+ * nonzero value if and only if \p a is a NaN value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isnan(float a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __DEVICE_FUNCTIONS_DECL__ int isfinite(long double a);
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isfinite(long double a);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Determine whether argument is finite.
+ *
+ * Determine whether the floating-point value \p a is a finite value
+ * (zero, subnormal, or normal and not infinity or NaN).
+ *
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns
+ * true if and only if \p a is a finite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns 
+ * a nonzero value if and only if \p a is a finite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isfinite(double a);
+#undef __RETURN_TYPE
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Determine whether argument is finite.
+ *
+ * Determine whether the floating-point value \p a is a finite value
+ * (zero, subnormal, or normal and not infinity or NaN).
+ *
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns
+ * true if and only if \p a is a finite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns 
+ * a nonzero value if and only if \p a is a finite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isfinite(double a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Determine whether argument is finite.
+ *
+ * Determine whether the floating-point value \p a is a finite value
+ * (zero, subnormal, or normal and not infinity or NaN).
+ *
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns
+ * true if and only if \p a is a finite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns 
+ * a nonzero value if and only if \p a is a finite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isfinite(float a);
+#undef __RETURN_TYPE
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Determine whether argument is finite.
+ *
+ * Determine whether the floating-point value \p a is a finite value
+ * (zero, subnormal, or normal and not infinity or NaN).
+ *
+ * \return
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns
+ * true if and only if \p a is a finite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns 
+ * a nonzero value if and only if \p a is a finite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isfinite(float a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800) /* no _MSC_VER, or _MSC_VER < 1800: declare without an exception specification */
+template<class T> extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ T _Pow_int(T, int);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs(long long int);
+#else /* !(!defined(_MSC_VER) || _MSC_VER < 1800): _MSC_VER >= 1800, same declarations marked throw() */
+template<class T> extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ T _Pow_int(T, int) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs(long long int) throw();
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#endif /* __CUDACC_RTC__ */
+
+#if __cplusplus >= 201103L /* __NV_NOEXCEPT: exception specifier applied to the pow() template declarations later in this file */
+#define __NV_NOEXCEPT noexcept
+#else /* !(__cplusplus >= 201103L): fall back to the pre-C++11 dynamic exception specification */
+#define __NV_NOEXCEPT throw()
+#endif /* __cplusplus >= 201103L */
+
+#if defined(_LIBCPP_VERSION) && defined(_LIBCPP_BEGIN_NAMESPACE_STD) && !defined(_STLPORT_VERSION)
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++11-extensions"
+#endif /* __clang__ */
+#if _LIBCPP_VERSION < 3800
+_LIBCPP_BEGIN_NAMESPACE_STD
+#endif /* _LIBCPP_VERSION < 3800 */
+#elif defined(__GNUC__) && !defined(_STLPORT_VERSION)
+namespace std {
+#endif /* defined(_LIBCPP_VERSION) && defined(_LIBCPP_BEGIN_NAMESPACE_STD) && !defined(_STLPORT_VERSION) ||
+          __GNUC__ && !_STLPORT_VERSION */
+
+#if defined(__CUDACC_RTC__) || defined(__GNUC__)
+
+#if defined(__CUDACC_RTC__) || \
+    (defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION >= 40800) || \
+    defined(__ibmxl__)
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs(long long int);
+#endif /* __CUDACC_RTC__ ||
+          (defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION >= 40800) ||
+          __ibmxl__ */
+
+#endif /* __CUDACC_RTC__ || __GNUC__ */
+
+#if defined(__CUDACC_RTC__) || \
+    (!defined(_MSC_VER) || _MSC_VER < 1800) && \
+    (!defined(_LIBCPP_VERSION) || (_LIBCPP_VERSION < 1101))
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long int __cdecl abs(long int);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl abs(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ double   __cdecl abs(double);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl fabs(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl ceil(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl floor(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sqrt(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl pow(float, float);
+
+#if !defined(__QNX__)
+     
+#if defined(__GNUC__) && __cplusplus >= 201103L && !defined(_LIBCPP_VERSION)
+template<typename _Tp, typename _Up>
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__
+typename __gnu_cxx::__promote_2<_Tp, _Up>::__type pow(_Tp, _Up);
+#else  /* !(defined(__GNUC__) && __cplusplus >= 201103L && !defined(_LIBCPP_VERSION)) */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl pow(float, int);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ double   __cdecl pow(double, int);
+#endif  /* defined(__GNUC__) && __cplusplus >= 201103L && !defined(_LIBCPP_VERSION) */
+     
+#endif  /* !defined(__QNX__) */
+
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl log(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl log10(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl fmod(float, float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl modf(float, float*);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl exp(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl frexp(float, int*);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl ldexp(float, int);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl asin(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sin(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sinh(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl acos(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl cos(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl cosh(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl atan(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl atan2(float, float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl tan(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl tanh(float);
+#else /* __CUDACC_RTC__ ||
+         (!defined(_MSC_VER) || _MSC_VER < 1800) &&
+         (!defined(_LIBCPP_VERSION) || (_LIBCPP_VERSION < 1101)) */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long int __cdecl abs(long int) throw();
+#if defined(_LIBCPP_VERSION)
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int __cdecl abs(long long int) throw();
+#endif /* defined(_LIBCPP_VERSION) */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl abs(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ double   __cdecl abs(double) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl fabs(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl ceil(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl floor(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sqrt(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl pow(float, float) throw();
+#if defined(_LIBCPP_VERSION)
+#if (defined (__ANDROID__) || defined(__HORIZON__)) && (_LIBCPP_VERSION >= 9000)
+template <class _A1, class _A2>
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__
+#if _LIBCPP_VERSION >= 14000
+typename std::__enable_if_t
+#else /* _LIBCPP_VERSION < 14000 */
+typename std::_EnableIf
+#endif /*  _LIBCPP_VERSION >= 14000 */
+<
+    std::is_arithmetic<_A1>::value &&
+    std::is_arithmetic<_A2>::value,
+    std::__promote<_A1, _A2>
+>::type pow(_A1 __lcpp_x, _A2 __lcpp_y) __NV_NOEXCEPT;
+#elif (defined(__APPLE__) && __clang_major__ >= 7) || _LIBCPP_VERSION >= 3800 || defined(__QNX__)
+template <class _Tp, class _Up>
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__
+#if _LIBCPP_VERSION >= 13000
+typename std::enable_if <
+#else /* _LIBCPP_VERSION < 13000  */
+typename std::__lazy_enable_if <
+#endif /* _LIBCPP_VERSION >= 13000  */
+  std::is_arithmetic<_Tp>::value && std::is_arithmetic<_Up>::value,
+  std::__promote<_Tp, _Up>
+>::type pow(_Tp __x, _Up __y) __NV_NOEXCEPT;
+#else /* !((__APPLE__ && __clang_major__ >= 7) || _LIBCPP_VERSION >= 3800 || __QNX__) */
+template <class _Tp, class _Up>
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__
+typename enable_if <
+  std::is_arithmetic<_Tp>::value && std::is_arithmetic<_Up>::value,
+  typename std::__promote<_Tp, _Up>::type
+>::type pow(_Tp __x, _Up __y) __NV_NOEXCEPT;
+#endif /* (__APPLE__ && __clang_major__ >= 7) || _LIBCPP_VERSION >= 3800 || __QNX__ */
+#else /* !defined(_LIBCPP_VERSION) */
+#if !(defined(__GNUC__) && __cplusplus >= 201103L)
+#if (defined(_MSC_VER) && (_MSC_VER >= 1928)) && !(defined __CUDA_INTERNAL_SKIP_CPP_HEADERS__)
+template <class _Ty1, class _Ty2, ::std:: enable_if_t< ::std:: is_arithmetic_v<_Ty1> && ::std:: is_arithmetic_v<_Ty2>, int> > [[nodiscard]] __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ ::std:: _Common_float_type_t<_Ty1, _Ty2> __cdecl pow(_Ty1 _Left, _Ty2 _Right) noexcept;
+#else
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl pow(float, int) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ double   __cdecl pow(double, int) throw();
+#endif /* (defined(_MSC_VER) && (_MSC_VER >= 1928)) && !(defined __CUDA_INTERNAL_SKIP_CPP_HEADERS__) */
+#endif /* !(defined(__GNUC__) && __cplusplus >= 201103L) */
+#endif /* defined(_LIBCPP_VERSION) */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl log(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl log10(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl fmod(float, float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl modf(float, float*) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl exp(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl frexp(float, int*) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl ldexp(float, int) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl asin(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sin(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sinh(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl acos(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl cos(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl cosh(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl atan(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl atan2(float, float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl tan(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl tanh(float) throw();
+#endif /* __CUDACC_RTC__ ||
+          (!defined(_MSC_VER) || _MSC_VER < 1800) &&
+          (!defined(_LIBCPP_VERSION) || (_LIBCPP_VERSION < 1101)) */
+
+#if defined(_LIBCPP_VERSION) && defined(_LIBCPP_END_NAMESPACE_STD) && !defined(_STLPORT_VERSION)
+#if _LIBCPP_VERSION < 3800
+_LIBCPP_END_NAMESPACE_STD
+#endif /* _LIBCPP_VERSION < 3800 */
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif /* __clang__ */
+#elif defined(__GNUC__) && !defined(_STLPORT_VERSION)
+}
+#endif /* defined(_LIBCPP_VERSION) && defined(_LIBCPP_END_NAMESPACE_STD) && !defined(_STLPORT_VERSION) ||
+          __GNUC__ && !_STLPORT_VERSION */
+
+#undef __DEVICE_FUNCTIONS_DECL__
+#undef __NV_NOEXCEPT
+
+#if defined(__CUDACC_RTC__) /* with __CUDACC_RTC__ defined, the macros below carry only the execution-space qualifiers */
+#define __MATH_FUNCTIONS_DECL__ __host__ __device__ /* prefix for the host+device declarations that follow */
+#define __MATH_FUNCTIONS_DEVICE_DECL__ __device__ /* prefix for the device-only declarations (cyl_bessel_i0, cyl_bessel_i1) */
+#else /* __CUDACC_RTC__ */
+#define __MATH_FUNCTIONS_DECL__ static inline __host__ __device__ __cudart_builtin__ /* same role, plus 'static inline' and __cudart_builtin__ */
+#define __MATH_FUNCTIONS_DEVICE_DECL__ static inline __device__ __cudart_builtin__ /* device-only variant, plus 'static inline' and __cudart_builtin__ */
+#endif /* __CUDACC_RTC__ */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#if defined(__QNX__) || (defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 3800)
+#if defined(__QNX__) && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 8000)
+#if defined(_LIBCPP_VERSION)
+#define __NV_NOEXCEPT _NOEXCEPT
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+#define __NV_NOEXCEPT
+namespace std {
+__host__ __device__ __cudart_builtin__ int ilogbf(float a);
+#endif
+#else /* !(defined(__QNX__) && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 8000)) */
+#define __NV_NOEXCEPT _NOEXCEPT
+#endif /* defined(__QNX__) && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 8000) */
+__host__ __device__ __cudart_builtin__ float logb(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ int ilogb(float a) __NV_NOEXCEPT;
+
+__host__ __device__ __cudart_builtin__ float scalbn(float a, int b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float scalbln(float a, long int b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float exp2(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float expm1(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float log2(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float log1p(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float acosh(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float asinh(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float atanh(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float hypot(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float cbrt(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float erf(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float erfc(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float lgamma(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float tgamma(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float copysign(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float nextafter(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float remainder(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float remquo(float a, float b, int *quo) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float round(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ long int lround(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ long long int llround(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float trunc(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float rint(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ long int lrint(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ long long int llrint(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float nearbyint(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float fdim(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float fma(float a, float b, float c) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float fmax(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float fmin(float a, float b) __NV_NOEXCEPT;
+#if defined(__QNX__) && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 8000)
+#if defined(_LIBCPP_VERSION)
+_LIBCPP_END_NAMESPACE_STD
+using _VSTD::logb;
+using _VSTD::ilogb;
+using _VSTD::scalbn;
+using _VSTD::scalbln;
+using _VSTD::exp2;
+using _VSTD::expm1;
+using _VSTD::log2;
+using _VSTD::log1p;
+using _VSTD::acosh;
+using _VSTD::asinh;
+using _VSTD::atanh;
+using _VSTD::hypot;
+using _VSTD::cbrt;
+using _VSTD::erf;
+using _VSTD::erfc;
+using _VSTD::lgamma;
+using _VSTD::tgamma;
+using _VSTD::copysign;
+using _VSTD::nextafter;
+using _VSTD::remainder;
+using _VSTD::remquo;
+using _VSTD::round;
+using _VSTD::lround;
+using _VSTD::llround;
+using _VSTD::trunc;
+using _VSTD::rint;
+using _VSTD::lrint;
+using _VSTD::llrint;
+using _VSTD::nearbyint;
+using _VSTD::fdim;
+using _VSTD::fma;
+using _VSTD::fmax;
+using _VSTD::fmin;
+#else
+}
+#endif
+#endif /* defined(__QNX__) && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 8000) */
+#undef __NV_NOEXCEPT
+#else /* !(defined(__QNX__ ) || (defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 3800)) */
+#if ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)
+namespace std {
+__host__ __device__ __cudart_builtin__ constexpr float logb(float a);
+__host__ __device__ __cudart_builtin__ constexpr int ilogb(float a);
+__host__ __device__ __cudart_builtin__ constexpr float scalbn(float a, int b);
+__host__ __device__ __cudart_builtin__ constexpr float scalbln(float a, long int b);
+__host__ __device__ __cudart_builtin__ constexpr float exp2(float a);
+__host__ __device__ __cudart_builtin__ constexpr float expm1(float a);
+__host__ __device__ __cudart_builtin__ constexpr float log2(float a);
+__host__ __device__ __cudart_builtin__ constexpr float log1p(float a);
+__host__ __device__ __cudart_builtin__ constexpr float acosh(float a);
+__host__ __device__ __cudart_builtin__ constexpr float asinh(float a);
+__host__ __device__ __cudart_builtin__ constexpr float atanh(float a);
+__host__ __device__ __cudart_builtin__ constexpr float hypot(float a, float b);
+__host__ __device__ __cudart_builtin__ constexpr float cbrt(float a);
+__host__ __device__ __cudart_builtin__ constexpr float erf(float a);
+__host__ __device__ __cudart_builtin__ constexpr float erfc(float a);
+__host__ __device__ __cudart_builtin__ constexpr float lgamma(float a);
+__host__ __device__ __cudart_builtin__ constexpr float tgamma(float a);
+__host__ __device__ __cudart_builtin__ constexpr float copysign(float a, float b);
+__host__ __device__ __cudart_builtin__ constexpr float nextafter(float a, float b);
+__host__ __device__ __cudart_builtin__ constexpr float remainder(float a, float b);
+__host__ __device__ __cudart_builtin__ float remquo(float a, float b, int *quo);
+__host__ __device__ __cudart_builtin__ constexpr float round(float a);
+__host__ __device__ __cudart_builtin__ constexpr long int lround(float a);
+__host__ __device__ __cudart_builtin__ constexpr long long int llround(float a);
+__host__ __device__ __cudart_builtin__ constexpr float trunc(float a);
+__host__ __device__ __cudart_builtin__ constexpr float rint(float a);
+__host__ __device__ __cudart_builtin__ constexpr long int lrint(float a);
+__host__ __device__ __cudart_builtin__ constexpr long long int llrint(float a);
+__host__ __device__ __cudart_builtin__ constexpr float nearbyint(float a);
+__host__ __device__ __cudart_builtin__ constexpr float fdim(float a, float b);
+__host__ __device__ __cudart_builtin__ constexpr float fma(float a, float b, float c);
+__host__ __device__ __cudart_builtin__ constexpr float fmax(float a, float b);
+__host__ __device__ __cudart_builtin__ constexpr float fmin(float a, float b);
+}
+#else /* !(((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)) */
+__MATH_FUNCTIONS_DECL__ float logb(float a);
+
+__MATH_FUNCTIONS_DECL__ int ilogb(float a);
+
+__MATH_FUNCTIONS_DECL__ float scalbn(float a, int b);
+
+__MATH_FUNCTIONS_DECL__ float scalbln(float a, long int b);
+
+__MATH_FUNCTIONS_DECL__ float exp2(float a);
+
+__MATH_FUNCTIONS_DECL__ float expm1(float a);
+
+__MATH_FUNCTIONS_DECL__ float log2(float a);
+
+__MATH_FUNCTIONS_DECL__ float log1p(float a);
+
+__MATH_FUNCTIONS_DECL__ float acosh(float a);
+
+__MATH_FUNCTIONS_DECL__ float asinh(float a);
+
+__MATH_FUNCTIONS_DECL__ float atanh(float a);
+
+__MATH_FUNCTIONS_DECL__ float hypot(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float cbrt(float a);
+
+__MATH_FUNCTIONS_DECL__ float erf(float a);
+
+__MATH_FUNCTIONS_DECL__ float erfc(float a);
+
+__MATH_FUNCTIONS_DECL__ float lgamma(float a);
+
+__MATH_FUNCTIONS_DECL__ float tgamma(float a);
+
+__MATH_FUNCTIONS_DECL__ float copysign(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float nextafter(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float remainder(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float remquo(float a, float b, int *quo);
+
+__MATH_FUNCTIONS_DECL__ float round(float a);
+
+__MATH_FUNCTIONS_DECL__ long int lround(float a);
+
+__MATH_FUNCTIONS_DECL__ long long int llround(float a);
+
+__MATH_FUNCTIONS_DECL__ float trunc(float a);
+
+__MATH_FUNCTIONS_DECL__ float rint(float a);
+
+__MATH_FUNCTIONS_DECL__ long int lrint(float a);
+
+__MATH_FUNCTIONS_DECL__ long long int llrint(float a);
+
+__MATH_FUNCTIONS_DECL__ float nearbyint(float a);
+
+__MATH_FUNCTIONS_DECL__ float fdim(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float fma(float a, float b, float c);
+
+__MATH_FUNCTIONS_DECL__ float fmax(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float fmin(float a, float b);
+#endif /* ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L) */
+#endif /* defined(__QNX__) || (defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 3800) */
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __host__ __device__ __cudart_builtin__ float __cdecl logb(float) throw();
+extern __host__ __device__ __cudart_builtin__ int   __cdecl ilogb(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl scalbn(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl scalbln(float, long int) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl exp2(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl expm1(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl log2(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl log1p(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl acosh(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl asinh(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl atanh(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl hypot(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl cbrt(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl erf(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl erfc(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl lgamma(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl tgamma(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl copysign(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl nextafter(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl remainder(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl remquo(float, float, int *) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl round(float) throw();
+extern __host__ __device__ __cudart_builtin__ long int      __cdecl lround(float) throw();
+extern __host__ __device__ __cudart_builtin__ long long int __cdecl llround(float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl trunc(float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl rint(float) throw();
+extern __host__ __device__ __cudart_builtin__ long int      __cdecl lrint(float) throw();
+extern __host__ __device__ __cudart_builtin__ long long int __cdecl llrint(float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl nearbyint(float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl fdim(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl fma(float, float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl fmax(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl fmin(float, float) throw();
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+__MATH_FUNCTIONS_DECL__ float exp10(const float a);
+
+__MATH_FUNCTIONS_DECL__ float rsqrt(const float a);
+
+__MATH_FUNCTIONS_DECL__ float rcbrt(const float a);
+
+__MATH_FUNCTIONS_DECL__ float sinpi(const float a);
+
+__MATH_FUNCTIONS_DECL__ float cospi(const float a);
+
+__MATH_FUNCTIONS_DECL__ void sincospi(const float a, float *const sptr, float *const cptr);
+
+__MATH_FUNCTIONS_DECL__ void sincos(const float a, float *const sptr, float *const cptr);
+
+__MATH_FUNCTIONS_DECL__ float j0(const float a);
+
+__MATH_FUNCTIONS_DECL__ float j1(const float a);
+
+__MATH_FUNCTIONS_DECL__ float jn(const int n, const float a);
+
+__MATH_FUNCTIONS_DECL__ float y0(const float a);
+
+__MATH_FUNCTIONS_DECL__ float y1(const float a);
+
+__MATH_FUNCTIONS_DECL__ float yn(const int n, const float a);
+
+__MATH_FUNCTIONS_DEVICE_DECL__ float cyl_bessel_i0(const float a);
+
+__MATH_FUNCTIONS_DEVICE_DECL__ float cyl_bessel_i1(const float a);
+
+__MATH_FUNCTIONS_DECL__ float erfinv(const float a);
+
+__MATH_FUNCTIONS_DECL__ float erfcinv(const float a);
+
+__MATH_FUNCTIONS_DECL__ float normcdfinv(const float a);
+
+__MATH_FUNCTIONS_DECL__ float normcdf(const float a);
+
+__MATH_FUNCTIONS_DECL__ float erfcx(const float a);
+
+__MATH_FUNCTIONS_DECL__ double copysign(const double a, const float b);
+
+__MATH_FUNCTIONS_DECL__ double copysign(const float a, const double b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int min(const unsigned int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p int and \p unsigned \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int min(const int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p int and \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int min(const unsigned int a, const int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ long int min(const long int a, const long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const unsigned long int a, const unsigned long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p long \p int and \p unsigned \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const long int a, const unsigned long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p long \p int and \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const unsigned long int a, const long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ long long int min(const long long int a, const long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const unsigned long long int a, const unsigned long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p long \p long \p int and \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const long long int a, const unsigned long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p long \p long \p int and \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const unsigned long long int a, const long long int b);
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the minimum value of the input \p float arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ * Behavior is equivalent to ::fminf() function.
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ float min(const float a, const float b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the minimum value of the input \p double arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ * Behavior is equivalent to ::fmin() function.
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double min(const double a, const double b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the minimum value of the input \p float and \p double arguments.
+ *
+ * Convert \p float argument \p a to \p double, followed by ::fmin().
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double min(const float a, const double b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the minimum value of the input \p double and \p float arguments.
+ *
+ * Convert \p float argument \p b to \p double, followed by ::fmin().
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double min(const double a, const float b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int max(const unsigned int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p int and \p unsigned \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int max(const int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p int and \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int max(const unsigned int a, const int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ long int max(const long int a, const long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const unsigned long int a, const unsigned long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p long \p int and \p unsigned \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const long int a, const unsigned long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p long \p int and \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const unsigned long int a, const long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ long long int max(const long long int a, const long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const unsigned long long int a, const unsigned long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p long \p long \p int and \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const long long int a, const unsigned long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p long \p long \p int and \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const unsigned long long int a, const long long int b);
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the maximum value of the input \p float arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ * Behavior is equivalent to ::fmaxf() function.
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ float max(const float a, const float b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the maximum value of the input \p double arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ * Behavior is equivalent to ::fmax() function.
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double max(const double a, const double b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the maximum value of the input \p float and \p double arguments.
+ *
+ * Convert \p float argument \p a to \p double, followed by ::fmax().
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double max(const float a, const double b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the maximum value of the input \p double and \p float arguments.
+ *
+ * Convert \p float argument \p b to \p double, followed by ::fmax().
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double max(const double a, const float b);
+
+#undef __MATH_FUNCTIONS_DECL__
+#undef __MATH_FUNCTIONS_DEVICE_DECL__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+#undef EXCLUDE_FROM_RTC
+
+extern "C"{ /* declarations inside this block get C language linkage */
+inline __device__ void *__nv_aligned_device_malloc(size_t size, size_t align) /* device-side wrapper: passes size and align through unchanged */
+{
+  __device__ void *__nv_aligned_device_malloc_impl(size_t, size_t); /* block-scope declaration: the impl name is visible only inside this wrapper */
+  return __nv_aligned_device_malloc_impl(size, align); /* returns the impl's result as-is; NOTE(review): failure value and valid 'align' values are defined by the impl, which is not visible here */
+}
+} /* extern "C" */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#define EXCLUDE_FROM_RTC
+
+#if !defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+* ONLY FOR HOST CODE! NOT FOR DEVICE EXECUTION                                 *
+*                                                                              *
+*******************************************************************************/
+
+#include <crt/func_macro.h>
+
+#if defined(_WIN32)
+#pragma warning (push)
+#pragma warning (disable : 4211)
+
+#endif /* _WIN32 */
+
+__func__(double rsqrt(double a));
+
+__func__(double rcbrt(double a));
+
+__func__(double sinpi(double a));
+
+__func__(double cospi(double a));
+
+__func__(void sincospi(double a, double *sptr, double *cptr));
+
+__func__(double erfinv(double a));
+
+__func__(double erfcinv(double a));
+
+__func__(double normcdfinv(double a));
+
+__func__(double normcdf(double a));
+
+__func__(double erfcx(double a));
+
+__func__(float rsqrtf(float a));
+
+__func__(float rcbrtf(float a));
+
+__func__(float sinpif(float a));
+
+__func__(float cospif(float a));
+
+__func__(void sincospif(float a, float *sptr, float *cptr));
+
+__func__(float erfinvf(float a));
+
+__func__(float erfcinvf(float a));
+
+__func__(float normcdfinvf(float a));
+
+__func__(float normcdff(float a));
+
+__func__(float erfcxf(float a));
+
+__func__(int min(int a, int b));
+
+__func__(unsigned int umin(unsigned int a, unsigned int b));
+
+__func__(long long int llmin(long long int a, long long int b));
+
+__func__(unsigned long long int ullmin(unsigned long long int a, unsigned long long int b));
+
+__func__(int max(int a, int b));
+
+__func__(unsigned int umax(unsigned int a, unsigned int b));
+
+__func__(long long int llmax(long long int a, long long int b));
+
+__func__(unsigned long long int ullmax(unsigned long long int a, unsigned long long int b));
+
+#if defined(_WIN32) || defined(__APPLE__) || defined (__ANDROID__)
+
+__func__(int __isnan(double a));
+
+#endif /* _WIN32 || __APPLE__ || __ANDROID__ */
+
+#if defined(_WIN32) || defined(__APPLE__) || defined (__QNX__)
+
+__func__(void sincos(double a, double *sptr, double *cptr));
+
+#endif /* _WIN32 || __APPLE__ || __QNX__ */
+
+#if defined(_WIN32) || defined(__APPLE__)
+
+__func__(double exp10(double a));
+
+__func__(float exp10f(float a));
+
+__func__(void sincosf(float a, float *sptr, float *cptr));
+
+__func__(int __isinf(double a));
+
+#endif /* _WIN32 || __APPLE__ */
+
+#if (defined(_WIN32) && (!defined(_MSC_VER) || _MSC_VER < 1800)) || defined (__ANDROID__)
+
+__func__(double log2(double a));
+
+#endif /* (_WIN32 && (!defined(_MSC_VER) || _MSC_VER < 1800)) || __ANDROID__ */
+
+#if defined(_WIN32)
+
+__func__(int __signbit(double a));
+
+__func__(int __finite(double a));
+
+__func__(int __signbitl(long double a));
+
+__func__(int __signbitf(float a));
+
+__func__(int __finitel(long double a));
+
+__func__(int __finitef(float a));
+
+__func__(int __isinfl(long double a));
+
+__func__(int __isinff(float a));
+
+__func__(int __isnanl(long double a));
+
+__func__(int __isnanf(float a));
+
+#endif /* _WIN32 */
+
+#if defined(_WIN32) && (!defined(_MSC_VER) || _MSC_VER < 1800)
+
+__func__(double copysign(double a, double b));
+
+__func__(double fmax(double a, double b));
+
+__func__(double fmin(double a, double b));
+
+__func__(double trunc(double a));
+
+__func__(double round(double a));
+
+__func__(long int lround(double a));
+
+__func__(long long int llround(double a));
+
+__func__(double rint(double a));
+
+__func__(double nearbyint(double a));
+
+__func__(long int lrint(double a));
+
+__func__(long long int llrint(double a));
+
+__func__(double fdim(double a, double b));
+
+__func__(double scalbn(double a, int b));
+
+__func__(double scalbln(double a, long int b));
+
+__func__(double exp2(double a));
+
+__func__(double log1p(double a));
+
+__func__(double expm1(double a));
+
+__func__(double cbrt(double a));
+
+__func__(double acosh(double a));
+
+__func__(double asinh(double a));
+
+__func__(double atanh(double a));
+
+__func__(int ilogb(double a));
+
+__func__(double logb(double a));
+
+__func__(double remquo(double a, double b, int *quo));
+
+__func__(double remainder(double a, double b));
+
+__func__(double fma (double a, double b, double c));
+
+__func__(double nextafter(double a, double b));
+
+__func__(double erf(double a));
+
+__func__(double erfc(double a));
+
+__func__(double lgamma(double a));
+
+__func__(unsigned long long int __internal_host_nan_kernel(const char *s));
+
+__func__(double nan(const char *tagp));
+
+__func__(double __host_tgamma_kernel(double a));
+
+__func__(double __host_stirling_poly(double a));
+
+__func__(double __host_tgamma_stirling(double a));
+
+__func__(double tgamma(double a));
+
+__func__(float fmaxf(float a, float b));
+
+__func__(float fminf(float a, float b));
+
+__func__(float roundf(float a));
+
+__func__(long int lroundf(float a));
+
+__func__(long long int llroundf(float a));
+
+__func__(float truncf(float a));
+
+__func__(float rintf(float a));
+
+__func__(float nearbyintf(float a));
+
+__func__(long int lrintf(float a));
+
+__func__(long long int llrintf(float a));
+
+__func__(float logbf(float a));
+
+__func__(float scalblnf(float a, long int b));
+
+__func__(float log2f(float a));
+
+__func__(float exp2f(float a));
+
+__func__(float acoshf(float a));
+
+__func__(float asinhf(float a));
+
+__func__(float atanhf(float a));
+
+__func__(float cbrtf(float a));
+
+__func__(float expm1f(float a));
+
+__func__(float fdimf(float a, float b));
+
+__func__(float log1pf(float a));
+
+__func__(float scalbnf(float a, int b));
+
+__func__(float fmaf(float a, float b, float c));
+
+__func__(int ilogbf(float a));
+
+__func__(float erff(float a));
+
+__func__(float erfcf(float a));
+
+__func__(float lgammaf(float a));
+
+__func__(float tgammaf(float a));
+
+__func__(float remquof(float a, float b, int *quo));
+
+__func__(float remainderf(float a, float b));
+
+__func__(float copysignf(float a, float b));
+
+__func__(float nextafterf(float a, float b));
+
+__func__(float nanf(const char *tagp));
+
+#endif /* _WIN32 && (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if defined(_WIN32)
+#pragma warning (pop)
+#endif /* _WIN32 */
+
+#endif /* !__CUDACC__ */
+
+#undef EXCLUDE_FROM_RTC
+
+#if !defined(__CUDACC_RTC__)
+
+#include "math_functions.hpp"
+
+#endif /* !__CUDACC_RTC__ */
+
+#endif /* !__MATH_FUNCTIONS_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H__
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/math_functions.hpp b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/math_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cc09b915ea07f8ef376f5c3640f963a09e86dbfd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/math_functions.hpp
@@ -0,0 +1,3398 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/math_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/math_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_HPP__
+#endif
+
+#if !defined(__MATH_FUNCTIONS_HPP__)
+#define __MATH_FUNCTIONS_HPP__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDACC_RTC__)
+
+/* __CUDACC_RTC__ build: overloads of the floating-point classification
+ * functions.  Each one forwards to the double-underscore routine named in its
+ * body and returns that routine's int result unchanged.  The long double
+ * overloads narrow their argument to double with static_cast first. */
+__host__ __device__ __cudart_builtin__ int signbit(const float x) { return __signbitf(x); }
+__host__ __device__ __cudart_builtin__ int signbit(const double x) { return __signbit(x); }
+__host__ __device__ __cudart_builtin__ int signbit(const long double x) { return __signbitl(static_cast<double>(x));}
+
+__host__ __device__ __cudart_builtin__ int isfinite(const float x) { return __finitef(x); }
+__host__ __device__ __cudart_builtin__ int isfinite(const double x) { return __finite(x); }
+__host__ __device__ __cudart_builtin__ int isfinite(const long double x) { return __finitel(static_cast<double>(x)); }
+
+__host__ __device__ __cudart_builtin__ int isnan(const float x) { return __isnanf(x); }
+__host__ __device__ __cudart_builtin__ int isnan(const double x) { return __isnan(x); }
+__host__ __device__ __cudart_builtin__ int isnan(const long double x) { return __isnanl(static_cast<double>(x)); }
+
+__host__ __device__ __cudart_builtin__ int isinf(const float x) { return __isinff(x); }
+__host__ __device__ __cudart_builtin__ int isinf(const double x) { return __isinf(x); }
+__host__ __device__ __cudart_builtin__ int isinf(const long double x) { return __isinfl(static_cast<double>(x)); }
+
+/* __CUDACC_RTC__ build: abs() overloads.  Both integer widths (long long and
+ * long) forward to llabs(); the floating-point ones forward to fabsf()/fabs(). */
+__host__ __device__ __cudart_builtin__ long long int abs(const long long int a) { return llabs(a); }
+
+__host__ __device__ __cudart_builtin__ long int  abs(const long int in)        { return llabs(in); }
+__host__ __device__ __cudart_builtin__ float     abs(const float in)           { return fabsf(in); }
+__host__ __device__ __cudart_builtin__ double    abs(const double in)          { return fabs(in); }
+/* float overloads of the unsuffixed math functions; each forwards to the
+ * f-suffixed single-precision routine of the same name. */
+__host__ __device__ __cudart_builtin__ float     fabs(const float in)          { return fabsf(in); }
+__host__ __device__ __cudart_builtin__ float     ceil(const float in)          { return ceilf(in); }
+__host__ __device__ __cudart_builtin__ float     floor(const float in)         { return floorf(in); }
+__host__ __device__ __cudart_builtin__ float     sqrt(const float in)          { return sqrtf(in); }
+__host__ __device__ __cudart_builtin__ float     pow(const float a, const float b)   { return powf(a, b); }
+/* Integer-exponent pow() overloads forward to powif()/powi(), which are only
+ * declared here (extern "C", __device__); their definitions are not in this file. */
+extern "C" __device__ float powif(float, int); 
+__host__ __device__ __cudart_builtin__ float     pow(const float a, const int b)     { return powif(a, b); }
+extern "C" __device__ double powi(double, int);
+__host__ __device__ __cudart_builtin__ double    pow(const double a, const int b)    { return powi(a, b); }
+__host__ __device__ __cudart_builtin__ float     log(const float in)           { return logf(in); }
+__host__ __device__ __cudart_builtin__ float     log10(const float in)         { return log10f(in); }
+__host__ __device__ __cudart_builtin__ float     fmod(const float a, const float b)  { return fmodf(a, b); }
+__host__ __device__ __cudart_builtin__ float     modf(const float a, float*b)  { return modff(a, b); }
+__host__ __device__ __cudart_builtin__ float     exp(const float in)           { return expf(in); }
+__host__ __device__ __cudart_builtin__ float     frexp(const float a, int*b)   { return frexpf(a, b); }
+__host__ __device__ __cudart_builtin__ float     ldexp(const float a, int b)   { return ldexpf(a, b); }
+__host__ __device__ __cudart_builtin__ float     asin(const float in)          { return asinf(in); }
+__host__ __device__ __cudart_builtin__ float     sin(const float in)           { return sinf(in); }
+__host__ __device__ __cudart_builtin__ float     sinh(const float in)          { return sinhf(in); }
+__host__ __device__ __cudart_builtin__ float     acos(const float in)          { return acosf(in); }
+__host__ __device__ __cudart_builtin__ float     cos(const float in)           { return cosf(in); }
+__host__ __device__ __cudart_builtin__ float     cosh(const float in)          { return coshf(in); }
+__host__ __device__ __cudart_builtin__ float     atan(const float in)          { return atanf(in); }
+__host__ __device__ __cudart_builtin__ float     atan2(const float a, const float b) { return atan2f(a, b); }
+__host__ __device__ __cudart_builtin__ float     tan(const float in)           { return tanf(in); }
+__host__ __device__ __cudart_builtin__ float     tanh(const float in)          { return tanhf(in); }
+
+#elif defined(__GNUC__)
+
+#undef signbit
+#undef isfinite
+#undef isnan
+#undef isinf
+
+#if defined(_LIBCPP_VERSION)
+extern "C" __device__ float powif(float, int);
+extern "C" __device__ double powi(double, int);
+#endif /* _LIBCPP_VERSION */
+
+#if defined(__APPLE__)
+/* __APPLE__ branch: classification overloads that forward to the
+ * double-underscore routine named in each body and return its int result. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const float x) { return __signbitf(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const double x) { return __signbitd(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const long double x) { return __signbitl(x);}
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const float x) { return __isfinitef(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const double x) { return __isfinited(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const long double x) { return __isfinite(x); }
+
+/* The double overloads of isnan/isinf are always defined (with throw());
+ * the float and long double overloads are only defined when libc++ is
+ * present and older than version 7000. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const double x) throw()  { return __isnand(x); }
+#if defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 7000
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const float x) { return __isnanf(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const long double x) { return __isnan(x); }
+#endif /* defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 7000 */
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const double x) throw()  { return __isinfd(x); }
+#if defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 7000
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const float x) { return __isinff(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const long double x) { return __isinf(x); }
+#endif /* defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 7000 */
+#else /* __APPLE__ */
+
+#if ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)
+#if defined(__CUDA_ARCH__)
+/* Temporary shorthand for the qualifier list used by the two definitions
+ * below; it is #undef'd again right after them. */
+#define __NV_BUILTIN_FUNC_DECL__ __forceinline__ __host__ __device__ __cudart_builtin__
+/* int-returning isnan/isinf(double), defined only when
+ * _GLIBCXX_HAVE_OBSOLETE_ISNAN is set and
+ * _GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC is not. */
+#if _GLIBCXX_HAVE_OBSOLETE_ISNAN && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC
+__NV_BUILTIN_FUNC_DECL__ int  isnan(const double a) throw() { return __isnan(a); }
+__NV_BUILTIN_FUNC_DECL__ int  isinf(const double x) throw() { return __isinf(x); }
+#endif /* _GLIBCXX_HAVE_OBSOLETE_ISNAN && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC */
+#undef __NV_BUILTIN_FUNC_DECL__
+#endif /* __CUDA_ARCH__ */
+#else /* !(((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)) */
+
+#if defined(__QNX__)
+#if defined(__QNX__) && defined(_LIBCPP_VERSION)
+/* QNX + libc++: bool-returning signbit overloads.  When __CUDA_ARCH__ is
+ * defined they test the int result of __signbitf/__signbit/__signbitl against
+ * zero; otherwise they call the signbit<T> template, which is declared
+ * outside this file. */
+static __inline__ __host__ __device__ __cudart_builtin__ bool signbit(const float x)
+{
+#if defined(__CUDA_ARCH__)
+  return (__signbitf(x) != 0);
+#else /* !__CUDA_ARCH__ */
+  return signbit<float>(x);
+#endif /* __CUDA_ARCH__ */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool signbit(const double x)
+{
+#if defined(__CUDA_ARCH__)
+  return (__signbit(x) != 0);
+#else /* !__CUDA_ARCH__ */
+  return signbit<double>(x);
+#endif /* __CUDA_ARCH__ */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool signbit(const long double x)
+{
+#if defined(__CUDA_ARCH__)
+  return (__signbitl(x) != 0);
+#else /* !__CUDA_ARCH__ */
+  return signbit<long double>(x);
+#endif /* __CUDA_ARCH__ */
+}
+#endif /* (__QNX__ && _LIBCPP_VERSION) */
+
+/* QNX: bool-returning isfinite overloads.  When __CUDA_ARCH__ is defined they
+ * test the int result of __finitel/__finite/__finitef against zero; otherwise
+ * they call the isfinite<T> template, which is declared outside this file. */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const long double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__finitel(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isfinite<long double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__finite(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isfinite<double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const float a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__finitef(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isfinite<float>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+
+/* QNX: bool-returning isnan overloads.  When __CUDA_ARCH__ is defined they
+ * test the int result of __isnanl/__isnan/__isnanf against zero; otherwise
+ * they call the isnan<T> template, which is declared outside this file. */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const long double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isnanl(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isnan<long double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isnan(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isnan<double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const float a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isnanf(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isnan<float>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+
+/* QNX: bool-returning isinf overloads.  When __CUDA_ARCH__ is defined they
+ * test the int result of __isinfl/__isinf/__isinff against zero; otherwise
+ * they call the isinf<T> template, which is declared outside this file. */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isinf(const long double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isinfl(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isinf<long double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isinf(const double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isinf(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isinf<double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isinf(const float a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isinff(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isinf<float>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+
+#elif ( (defined(__ANDROID__) || defined(__HORIZON__)) && defined(_LIBCPP_VERSION))
+#if defined(__CUDA_ARCH__)
+/* Android/Horizon + libc++, __CUDA_ARCH__ defined: int-returning
+ * classification overloads that forward to the double-underscore routine
+ * named in each body. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const float x) { return __signbitf(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const double x) { return __signbit(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const long double x) { return __signbitl(x);}
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const float x) { return __finitef(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const double x) { return __finite(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const long double x) { return __finitel(x); }
+
+/* For isnan/isinf the double overload is always defined here; the float and
+ * long double overloads are only defined for libc++ older than version 8000. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const double x)  { return __isnan(x); }
+#if _LIBCPP_VERSION < 8000
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const float x) { return __isnanf(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const long double x) { return __isnanl(x); }
+#endif  /* _LIBCPP_VERSION < 8000 */
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const double x) { return __isinf(x); }
+#if _LIBCPP_VERSION < 8000
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const float x) { return __isinff(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const long double x) { return __isinfl(x); }
+#endif /* _LIBCPP_VERSION < 8000 */
+#else /* !defined(__CUDA_ARCH__) */
+/* Android/Horizon + libc++, __CUDA_ARCH__ not defined: int-returning
+ * classification overloads that call the signbit<T>/isfinite<T>/isnan<T>/
+ * isinf<T> templates, which are declared outside this file. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const float x) { return signbit<float>(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const double x) { return signbit<double>(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const long double x) { return signbit<long double>(x);}
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const float x) { return isfinite<float>(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const double x) { return isfinite<double>(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const long double x) { return isfinite<long double>(x); }
+
+/* isnan/isinf: only float and long double are defined here, and only for
+ * libc++ older than version 8000. */
+#if _LIBCPP_VERSION < 8000
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const float x) { return isnan<float>(x); }
+/* int isnan(double) provided by math.h */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const long double x) { return isnan<long double>(x); }
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const float x) { return isinf<float>(x); }
+/* int isinf(double) provided by math.h */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const long double x) { return isinf<long double>(x); }
+#endif /* _LIBCPP_VERSION < 8000 */
+
+#endif  /* defined(__CUDA_ARCH__) */
+
+#else /* !__QNX__ */
+/* Fallback branch (not QNX, not Android/Horizon with libc++): int-returning
+ * signbit overloads forwarding to __signbitf/__signbit/__signbitl.  The __ICC
+ * variant of the double overload differs only by its throw() specification. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const float x) { return __signbitf(x); }
+#if defined(__ICC)
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const double x) throw() { return __signbit(x); }
+#else /* !__ICC */
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const double x) { return __signbit(x); }
+#endif /* __ICC */
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const long double x) { return __signbitl(x);}
+
+/* int-returning isfinite overloads for float, double and long double.
+ * On Android each overload calls __finite* when __CUDA_ARCH__ is defined and
+ * __isfinite* otherwise; on every other platform it calls __finite*
+ * unconditionally.  The __ICC variant of the double overload differs only by
+ * its throw() specification. */
+#if defined(__ANDROID__)
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const float x) {
+#if defined(__CUDA_ARCH__)
+  return __finitef(x);
+#else	/* !__CUDA_ARCH__ */
+  return __isfinitef(x);
+#endif /* __CUDA_ARCH__ */
+}
+#else /* !__ANDROID__ */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const float x) { return __finitef(x); }
+#endif  /* __ANDROID__ */
+
+#if defined(__ANDROID__)
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const double x)
+{
+#ifdef __CUDA_ARCH__
+  return __finite(x);
+#else  /* !__CUDA_ARCH__ */
+  return __isfinite(x);
+#endif  /* __CUDA_ARCH__ */
+}
+#elif defined(__ICC)
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const double x) throw() { return __finite(x); }
+#else
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const double x) { return __finite(x); }
+#endif /* __ANDROID__ */
+
+#if defined(__ANDROID__)
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const long double x)
+{
+#ifdef __CUDA_ARCH__
+   return __finitel(x);
+#else /* !__CUDA_ARCH__ */
+   return __isfinitel(x);
+#endif  /* __CUDA_ARCH__ */
+}
+#else /* !__ANDROID__ */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const long double x) { return __finitel(x); }
+#endif  /* __ANDROID__ */
+
+/* int-returning isnan overloads forwarding to __isnanf/__isnan/__isnanl.
+ * The double overload carries throw() everywhere except on Android. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const float x) { return __isnanf(x); }
+#if defined(__ANDROID__)
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const double x) { return __isnan(x); }
+#else /* !__ANDROID__ */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const double x) throw()  { return __isnan(x); }
+#endif /* __ANDROID__ */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const long double x) { return __isnanl(x); }
+
+/* int-returning isinf overloads forwarding to __isinff/__isinf/__isinfl.
+ * The double overload carries throw() everywhere except on Android. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const float x) { return __isinff(x); }
+#if defined(__ANDROID__)
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const double x) { return __isinf(x); }
+#else /* !__ANDROID__ */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const double x) throw()  { return __isinf(x); }
+#endif /* __ANDROID__ */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const long double x) { return __isinfl(x); }
+#endif /* __QNX__ || ((__ANDROID__ || __HORIZON__) && _LIBCPP_VERSION) */
+
+#endif /* ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L) */
+#endif /* __APPLE__ */
+
+#if defined(__arm__) && !defined(_STLPORT_VERSION) && !_GLIBCXX_USE_C99
+#if !defined(__ANDROID__) || (!defined(_LIBCPP_VERSION) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8)))
+
+#if !defined(__QNX__) && !defined(__HORIZON__)
+/* abs() for long long int; simply forwards to llabs(). */
+static __inline__ __host__ __device__ __cudart_builtin__ long long int abs(const long long int a) { return llabs(a); }
+#endif /* !__QNX__ && !__HORIZON__*/
+
+#endif /* !defined(__ANDROID__) || (!defined(_LIBCPP_VERSION) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8))) */
+#endif /* __arm__ && !_STLPORT_VERSION && !_GLIBCXX_USE_C99 */
+
+#elif defined(_WIN32)
+
+/* _WIN32 branch: int-returning signbit overloads, defined only when the
+ * compiler is not MSVC or is MSVC older than _MSC_VER 1800.  Each forwards to
+ * __signbitl/__signbit/__signbitf. */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int signbit(const long double a)
+{
+  return __signbitl(a);
+}
+
+static __inline__ __host__ __device__ __cudart_builtin__ int signbit(const double a)
+{
+  return __signbit(a);
+}
+
+static __inline__ __host__ __device__ __cudart_builtin__ int signbit(const float a)
+{
+  return __signbitf(a);
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+/* _WIN32 branch: isinf overloads for long double, double and float.
+ * Without MSVC, or with _MSC_VER < 1800, each returns the int result of
+ * __isinfl/__isinf/__isinff directly.  Otherwise each returns bool: the
+ * same routine compared against zero when __CUDA_ARCH__ is defined, or the
+ * isinf<T> template (declared outside this file) when it is not. */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isinf(const long double a)
+{
+  return __isinfl(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isinf(const long double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isinfl(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isinf<long double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isinf(const double a)
+{
+  return __isinf(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isinf(const double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isinf(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isinf<double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isinf(const float a)
+{
+  return __isinff(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/* NOTE(review): unlike every sibling overload in this group, this one is
+ * declared without __cudart_builtin__ -- confirm whether that is intentional. */
+static __inline__ __host__ __device__ bool isinf(const float a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isinff(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isinf<float>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+/* _WIN32 branch: isnan overloads for long double, double and float.
+ * Without MSVC, or with _MSC_VER < 1800, each returns the int result of
+ * __isnanl/__isnan/__isnanf directly.  Otherwise each returns bool: the
+ * same routine compared against zero when __CUDA_ARCH__ is defined, or the
+ * isnan<T> template (declared outside this file) when it is not. */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isnan(const long double a)
+{
+  return __isnanl(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const long double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isnanl(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isnan<long double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isnan(const double a)
+{
+  return __isnan(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isnan(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isnan<double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isnan(const float a)
+{
+  return __isnanf(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const float a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isnanf(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isnan<float>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+/* _WIN32 branch: isfinite overloads for long double, double and float.
+ * Without MSVC, or with _MSC_VER < 1800, each returns the int result of
+ * __finitel/__finite/__finitef directly.  Otherwise each returns bool: the
+ * same routine compared against zero when __CUDA_ARCH__ is defined, or the
+ * isfinite<T> template (declared outside this file) when it is not. */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isfinite(const long double a)
+{
+  return __finitel(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const long double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__finitel(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isfinite<long double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isfinite(const double a)
+{
+  return __finite(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const double a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__finite(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isfinite<double>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isfinite(const float a)
+{
+  return __finitef(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const float a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__finitef(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isfinite<float>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#endif /* __CUDACC_RTC__ */
+
+/* Qualifier macros used by the wrapper definitions that follow.
+ * __MATH_FUNCTIONS_DECL__ marks host+device functions and
+ * __MATH_FUNCTIONS_DEVICE_DECL__ marks device-only ones.  Under
+ * __CUDACC_RTC__ they expand to the execution-space qualifiers alone;
+ * otherwise "static inline" is prepended. */
+#if defined(__CUDACC_RTC__)
+#define __MATH_FUNCTIONS_DECL__ __host__ __device__
+#define __MATH_FUNCTIONS_DEVICE_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __MATH_FUNCTIONS_DECL__ static inline __host__ __device__
+#define __MATH_FUNCTIONS_DEVICE_DECL__ static inline __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__CUDACC_RTC__) || (!defined(_MSC_VER) || _MSC_VER < 1800)
+#if defined(__QNX__) && defined(_LIBCPP_VERSION)
+_LIBCPP_BEGIN_NAMESPACE_STD
+#endif /* __QNX__ && _LIBCPP_VERSION */
+#if !defined(__QNX__) && !(defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 3800)
+#if !(((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L))
+/* float overloads of the exponent extraction / scaling functions; each one
+ * forwards to the f-suffixed single-precision routine. */
+__MATH_FUNCTIONS_DECL__ float logb(const float a) { return logbf(a); }
+
+__MATH_FUNCTIONS_DECL__ int ilogb(const float a) { return ilogbf(a); }
+
+__MATH_FUNCTIONS_DECL__ float scalbn(const float a, const int b) { return scalbnf(a, b); }
+
+__MATH_FUNCTIONS_DECL__ float scalbln(const float a, const long int b) { return scalblnf(a, b); }
+
+/* float overloads of the exponential / logarithm variants; each one forwards
+ * to the f-suffixed single-precision routine. */
+__MATH_FUNCTIONS_DECL__ float exp2(const float a) { return exp2f(a); }
+
+__MATH_FUNCTIONS_DECL__ float expm1(const float a) { return expm1f(a); }
+
+__MATH_FUNCTIONS_DECL__ float log2(const float a) { return log2f(a); }
+
+__MATH_FUNCTIONS_DECL__ float log1p(const float a) { return log1pf(a); }
+
+/* float overloads of the inverse hyperbolic functions, hypot and cbrt; each
+ * one forwards to the f-suffixed single-precision routine. */
+__MATH_FUNCTIONS_DECL__ float acosh(const float a) { return acoshf(a); }
+
+__MATH_FUNCTIONS_DECL__ float asinh(const float a) { return asinhf(a); }
+
+__MATH_FUNCTIONS_DECL__ float atanh(const float a) { return atanhf(a); }
+
+__MATH_FUNCTIONS_DECL__ float hypot(const float a, const float b) { return hypotf(a, b); }
+
+__MATH_FUNCTIONS_DECL__ float cbrt(const float a) { return cbrtf(a); }
+
+__MATH_FUNCTIONS_DECL__ float erf(const float a)
+{
+  return erff(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float erfc(const float a)
+{
+  return erfcf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float lgamma(const float a)
+{
+  return lgammaf(a);
+}
+
+__MATH_FUNCTIONS_DECL__ float tgamma(const float a)
+{
+  return tgammaf(a);
+}
+
+/* float overloads of copysign, nextafter, remainder and remquo; each one
+ * forwards to the f-suffixed single-precision routine.  remquo passes the
+ * caller's quo pointer straight through. */
+__MATH_FUNCTIONS_DECL__ float copysign(const float a, const float b) { return copysignf(a, b); }
+
+__MATH_FUNCTIONS_DECL__ float nextafter(const float a, const float b) { return nextafterf(a, b); }
+
+__MATH_FUNCTIONS_DECL__ float remainder(const float a, const float b) { return remainderf(a, b); }
+
+__MATH_FUNCTIONS_DECL__ float remquo(const float a, const float b, int *quo) { return remquof(a, b, quo); }
+
+/* float overloads of the rounding functions; each one forwards to the
+ * f-suffixed single-precision routine and keeps that routine's return type
+ * (float, long int or long long int). */
+__MATH_FUNCTIONS_DECL__ float round(const float a) { return roundf(a); }
+
+__MATH_FUNCTIONS_DECL__ long int lround(const float a) { return lroundf(a); }
+
+__MATH_FUNCTIONS_DECL__ long long int llround(const float a) { return llroundf(a); }
+
+__MATH_FUNCTIONS_DECL__ float trunc(const float a) { return truncf(a); }
+
+__MATH_FUNCTIONS_DECL__ float rint(const float a) { return rintf(a); }
+
+__MATH_FUNCTIONS_DECL__ long int lrint(const float a) { return lrintf(a); }
+
+__MATH_FUNCTIONS_DECL__ long long int llrint(const float a) { return llrintf(a); }
+
+__MATH_FUNCTIONS_DECL__ float nearbyint(const float a) { return nearbyintf(a); }
+
+/* float overloads of fdim, fma, fmax and fmin; each one forwards to the
+ * f-suffixed single-precision routine. */
+__MATH_FUNCTIONS_DECL__ float fdim(const float a, const float b) { return fdimf(a, b); }
+
+__MATH_FUNCTIONS_DECL__ float fma(const float a, const float b, const float c) { return fmaf(a, b, c); }
+
+__MATH_FUNCTIONS_DECL__ float fmax(const float a, const float b) { return fmaxf(a, b); }
+
+__MATH_FUNCTIONS_DECL__ float fmin(const float a, const float b) { return fminf(a, b); }
+#endif /* !(((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)) */
+#endif /* !defined(__QNX__) && !(defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 3800) */
+#if defined(__QNX__) && defined(_LIBCPP_VERSION)
+_LIBCPP_END_NAMESPACE_STD
+#endif
+#endif /* __CUDACC_RTC__ || (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+/* float overloads of exp10, rsqrt and rcbrt; each one forwards to the
+ * f-suffixed single-precision routine. */
+__MATH_FUNCTIONS_DECL__ float exp10(const float a) { return exp10f(a); }
+
+__MATH_FUNCTIONS_DECL__ float rsqrt(const float a) { return rsqrtf(a); }
+
+__MATH_FUNCTIONS_DECL__ float rcbrt(const float a) { return rcbrtf(a); }
+
+/* float overloads of sinpi, cospi, sincospi and sincos; each one forwards to
+ * the f-suffixed single-precision routine.  The two sincos variants return
+ * nothing and hand the caller's sptr/cptr output pointers straight through. */
+__MATH_FUNCTIONS_DECL__ float sinpi(const float a) { return sinpif(a); }
+
+__MATH_FUNCTIONS_DECL__ float cospi(const float a) { return cospif(a); }
+
+__MATH_FUNCTIONS_DECL__ void sincospi(const float a, float *const sptr, float *const cptr) { sincospif(a, sptr, cptr); }
+
+__MATH_FUNCTIONS_DECL__ void sincos(const float a, float *const sptr, float *const cptr) { sincosf(a, sptr, cptr); }
+
+/* float overloads of j0, j1, jn, y0, y1 and yn; each one forwards to the
+ * f-suffixed single-precision routine.  jn and yn take the integer n first. */
+__MATH_FUNCTIONS_DECL__ float j0(const float a) { return j0f(a); }
+
+__MATH_FUNCTIONS_DECL__ float j1(const float a) { return j1f(a); }
+
+__MATH_FUNCTIONS_DECL__ float jn(const int n, const float a) { return jnf(n, a); }
+
+__MATH_FUNCTIONS_DECL__ float y0(const float a) { return y0f(a); }
+
+__MATH_FUNCTIONS_DECL__ float y1(const float a) { return y1f(a); }
+
+__MATH_FUNCTIONS_DECL__ float yn(const int n, const float a) { return ynf(n, a); }
+
+/* float overloads of cyl_bessel_i0 and cyl_bessel_i1, forwarding to the
+ * f-suffixed routines.  These two use the device-only declaration macro. */
+__MATH_FUNCTIONS_DEVICE_DECL__ float cyl_bessel_i0(const float a) { return cyl_bessel_i0f(a); }
+
+__MATH_FUNCTIONS_DEVICE_DECL__ float cyl_bessel_i1(const float a) { return cyl_bessel_i1f(a); }
+
+/* float overloads of erfinv, erfcinv, normcdfinv, normcdf and erfcx; each
+ * one forwards to the f-suffixed single-precision routine. */
+__MATH_FUNCTIONS_DECL__ float erfinv(const float a) { return erfinvf(a); }
+
+__MATH_FUNCTIONS_DECL__ float erfcinv(const float a) { return erfcinvf(a); }
+
+__MATH_FUNCTIONS_DECL__ float normcdfinv(const float a) { return normcdfinvf(a); }
+
+__MATH_FUNCTIONS_DECL__ float normcdf(const float a) { return normcdff(a); }
+
+__MATH_FUNCTIONS_DECL__ float erfcx(const float a) { return erfcxf(a); }
+
+/* Mixed (double, float) overload: widens b to double, then calls the
+   double/double copysign. */
+__MATH_FUNCTIONS_DECL__ double copysign(const double a, const float b)
+{
+  const double sign_source = static_cast<double>(b);
+  return copysign(a, sign_source);
+}
+
+/* Mixed (float, double) overload: widens a to double, then calls the
+   double/double copysign. */
+__MATH_FUNCTIONS_DECL__ double copysign(const float a, const double b)
+{
+  const double magnitude_source = static_cast<double>(a);
+  return copysign(magnitude_source, b);
+}
+
+/* unsigned int overload of min: delegates to umin. */
+__MATH_FUNCTIONS_DECL__ unsigned int min(const unsigned int a, const unsigned int b)
+{
+  const unsigned int smaller = umin(a, b);
+  return smaller;
+}
+
+/* Mixed (int, unsigned int) overload of min: a is converted to unsigned int
+   and the comparison is done by umin. */
+__MATH_FUNCTIONS_DECL__ unsigned int min(const int a, const unsigned int b)
+{
+  const unsigned int a_unsigned = static_cast<unsigned int>(a);
+  return umin(a_unsigned, b);
+}
+
+/* Mixed (unsigned int, int) overload of min: b is converted to unsigned int
+   and the comparison is done by umin. */
+__MATH_FUNCTIONS_DECL__ unsigned int min(const unsigned int a, const int b)
+{
+  const unsigned int b_unsigned = static_cast<unsigned int>(b);
+  return umin(a, b_unsigned);
+}
+
+/*
+ * long int overload of min. long can be a 32-bit type on some systems, so
+ * the operands are routed to the int min when long has the size of int and
+ * to llmin otherwise. The size test is a constant expression; MSVC warning
+ * C4127 is disabled around it for host compilation.
+ */
+__MATH_FUNCTIONS_DECL__ long int min(const long int a, const long int b)
+{
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(long int) == sizeof(int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const int narrow_a = static_cast<int>(a);
+    const int narrow_b = static_cast<int>(b);
+    return static_cast<long int>(min(narrow_a, narrow_b));
+  }
+  const long long int wide_a = static_cast<long long int>(a);
+  const long long int wide_b = static_cast<long long int>(b);
+  return static_cast<long int>(llmin(wide_a, wide_b));
+}
+
+/*
+ * unsigned long int overload of min. Uses umin when unsigned long has the
+ * size of unsigned int and ullmin otherwise. MSVC warning C4127 (constant
+ * conditional) is disabled around the size test for host compilation.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const unsigned long int a, const unsigned long int b)
+{
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int narrow_a = static_cast<unsigned int>(a);
+    const unsigned int narrow_b = static_cast<unsigned int>(b);
+    return static_cast<unsigned long int>(umin(narrow_a, narrow_b));
+  }
+  const unsigned long long int wide_a = static_cast<unsigned long long int>(a);
+  const unsigned long long int wide_b = static_cast<unsigned long long int>(b);
+  return static_cast<unsigned long int>(ullmin(wide_a, wide_b));
+}
+
+/*
+ * Mixed (long int, unsigned long int) overload of min. Both operands are
+ * converted to an unsigned type before comparing: umin when unsigned long
+ * has the size of unsigned int, ullmin otherwise. MSVC warning C4127 is
+ * disabled around the constant size test for host compilation.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const long int a, const unsigned long int b)
+{
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int narrow_a = static_cast<unsigned int>(a);
+    const unsigned int narrow_b = static_cast<unsigned int>(b);
+    return static_cast<unsigned long int>(umin(narrow_a, narrow_b));
+  }
+  const unsigned long long int wide_a = static_cast<unsigned long long int>(a);
+  const unsigned long long int wide_b = static_cast<unsigned long long int>(b);
+  return static_cast<unsigned long int>(ullmin(wide_a, wide_b));
+}
+
+/*
+ * Mixed (unsigned long int, long int) overload of min. Both operands are
+ * converted to an unsigned type before comparing: umin when unsigned long
+ * has the size of unsigned int, ullmin otherwise. MSVC warning C4127 is
+ * disabled around the constant size test for host compilation.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const unsigned long int a, const long int b)
+{
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int narrow_a = static_cast<unsigned int>(a);
+    const unsigned int narrow_b = static_cast<unsigned int>(b);
+    return static_cast<unsigned long int>(umin(narrow_a, narrow_b));
+  }
+  const unsigned long long int wide_a = static_cast<unsigned long long int>(a);
+  const unsigned long long int wide_b = static_cast<unsigned long long int>(b);
+  return static_cast<unsigned long int>(ullmin(wide_a, wide_b));
+}
+
+/* long long int overload of min: delegates to llmin. */
+__MATH_FUNCTIONS_DECL__ long long int min(const long long int a, const long long int b)
+{
+  const long long int smaller = llmin(a, b);
+  return smaller;
+}
+
+/* unsigned long long int overload of min: delegates to ullmin. */
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const unsigned long long int a, const unsigned long long int b)
+{
+  const unsigned long long int smaller = ullmin(a, b);
+  return smaller;
+}
+
+/* Mixed (long long, unsigned long long) overload of min: a is converted to
+   unsigned long long and the comparison is done by ullmin. */
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const long long int a, const unsigned long long int b)
+{
+  const unsigned long long int a_unsigned = static_cast<unsigned long long int>(a);
+  return ullmin(a_unsigned, b);
+}
+
+/* Mixed (unsigned long long, long long) overload of min: b is converted to
+   unsigned long long and the comparison is done by ullmin. */
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const unsigned long long int a, const long long int b)
+{
+  const unsigned long long int b_unsigned = static_cast<unsigned long long int>(b);
+  return ullmin(a, b_unsigned);
+}
+
+/* float overload of min: delegates to fminf. */
+__MATH_FUNCTIONS_DECL__ float min(const float a, const float b)
+{
+  const float smaller = fminf(a, b);
+  return smaller;
+}
+
+/* double overload of min: delegates to fmin. */
+__MATH_FUNCTIONS_DECL__ double min(const double a, const double b)
+{
+  const double smaller = fmin(a, b);
+  return smaller;
+}
+
+/* Mixed (float, double) overload of min: a is widened to double before
+   calling fmin. */
+__MATH_FUNCTIONS_DECL__ double min(const float a, const double b)
+{
+  const double a_wide = static_cast<double>(a);
+  return fmin(a_wide, b);
+}
+
+/* Mixed (double, float) overload of min: b is widened to double before
+   calling fmin. */
+__MATH_FUNCTIONS_DECL__ double min(const double a, const float b)
+{
+  const double b_wide = static_cast<double>(b);
+  return fmin(a, b_wide);
+}
+
+/* unsigned int overload of max: delegates to umax. */
+__MATH_FUNCTIONS_DECL__ unsigned int max(const unsigned int a, const unsigned int b)
+{
+  const unsigned int larger = umax(a, b);
+  return larger;
+}
+
+/* Mixed (int, unsigned int) overload of max: a is converted to unsigned int
+   and the comparison is done by umax. */
+__MATH_FUNCTIONS_DECL__ unsigned int max(const int a, const unsigned int b)
+{
+  const unsigned int a_unsigned = static_cast<unsigned int>(a);
+  return umax(a_unsigned, b);
+}
+
+/* Mixed (unsigned int, int) overload of max: b is converted to unsigned int
+   and the comparison is done by umax. */
+__MATH_FUNCTIONS_DECL__ unsigned int max(const unsigned int a, const int b)
+{
+  const unsigned int b_unsigned = static_cast<unsigned int>(b);
+  return umax(a, b_unsigned);
+}
+
+/*
+ * long int overload of max. long can be a 32-bit type on some systems, so
+ * the operands are routed to the int max when long has the size of int and
+ * to llmax otherwise. The size test is a constant expression; MSVC warning
+ * C4127 is disabled around it for host compilation.
+ */
+__MATH_FUNCTIONS_DECL__ long int max(const long int a, const long int b)
+{
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(long int) == sizeof(int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const int narrow_a = static_cast<int>(a);
+    const int narrow_b = static_cast<int>(b);
+    return static_cast<long int>(max(narrow_a, narrow_b));
+  }
+  const long long int wide_a = static_cast<long long int>(a);
+  const long long int wide_b = static_cast<long long int>(b);
+  return static_cast<long int>(llmax(wide_a, wide_b));
+}
+
+/*
+ * unsigned long int overload of max. Uses umax when unsigned long has the
+ * size of unsigned int and ullmax otherwise. MSVC warning C4127 (constant
+ * conditional) is disabled around the size test for host compilation.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const unsigned long int a, const unsigned long int b)
+{
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int narrow_a = static_cast<unsigned int>(a);
+    const unsigned int narrow_b = static_cast<unsigned int>(b);
+    return static_cast<unsigned long int>(umax(narrow_a, narrow_b));
+  }
+  const unsigned long long int wide_a = static_cast<unsigned long long int>(a);
+  const unsigned long long int wide_b = static_cast<unsigned long long int>(b);
+  return static_cast<unsigned long int>(ullmax(wide_a, wide_b));
+}
+
+/*
+ * Mixed (long int, unsigned long int) overload of max. Both operands are
+ * converted to an unsigned type before comparing: umax when unsigned long
+ * has the size of unsigned int, ullmax otherwise. MSVC warning C4127 is
+ * disabled around the constant size test for host compilation.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const long int a, const unsigned long int b)
+{
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int narrow_a = static_cast<unsigned int>(a);
+    const unsigned int narrow_b = static_cast<unsigned int>(b);
+    return static_cast<unsigned long int>(umax(narrow_a, narrow_b));
+  }
+  const unsigned long long int wide_a = static_cast<unsigned long long int>(a);
+  const unsigned long long int wide_b = static_cast<unsigned long long int>(b);
+  return static_cast<unsigned long int>(ullmax(wide_a, wide_b));
+}
+
+/*
+ * Mixed (unsigned long int, long int) overload of max. Both operands are
+ * converted to an unsigned type before comparing: umax when unsigned long
+ * has the size of unsigned int, ullmax otherwise. MSVC warning C4127 is
+ * disabled around the constant size test for host compilation.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const unsigned long int a, const long int b)
+{
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int narrow_a = static_cast<unsigned int>(a);
+    const unsigned int narrow_b = static_cast<unsigned int>(b);
+    return static_cast<unsigned long int>(umax(narrow_a, narrow_b));
+  }
+  const unsigned long long int wide_a = static_cast<unsigned long long int>(a);
+  const unsigned long long int wide_b = static_cast<unsigned long long int>(b);
+  return static_cast<unsigned long int>(ullmax(wide_a, wide_b));
+}
+
+/* long long int overload of max: delegates to llmax. */
+__MATH_FUNCTIONS_DECL__ long long int max(const long long int a, const long long int b)
+{
+  const long long int larger = llmax(a, b);
+  return larger;
+}
+
+/* unsigned long long int overload of max: delegates to ullmax. */
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const unsigned long long int a, const unsigned long long int b)
+{
+  const unsigned long long int larger = ullmax(a, b);
+  return larger;
+}
+
+/* Mixed (long long, unsigned long long) overload of max: a is converted to
+   unsigned long long and the comparison is done by ullmax. */
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const long long int a, const unsigned long long int b)
+{
+  const unsigned long long int a_unsigned = static_cast<unsigned long long int>(a);
+  return ullmax(a_unsigned, b);
+}
+
+/* Mixed (unsigned long long, long long) overload of max: b is converted to
+   unsigned long long and the comparison is done by ullmax. */
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const unsigned long long int a, const long long int b)
+{
+  const unsigned long long int b_unsigned = static_cast<unsigned long long int>(b);
+  return ullmax(a, b_unsigned);
+}
+
+/* float overload of max: delegates to fmaxf. */
+__MATH_FUNCTIONS_DECL__ float max(const float a, const float b)
+{
+  const float larger = fmaxf(a, b);
+  return larger;
+}
+
+/* double overload of max: delegates to fmax. */
+__MATH_FUNCTIONS_DECL__ double max(const double a, const double b)
+{
+  const double larger = fmax(a, b);
+  return larger;
+}
+
+/* Mixed (float, double) overload of max: a is widened to double before
+   calling fmax. */
+__MATH_FUNCTIONS_DECL__ double max(const float a, const double b)
+{
+  const double a_wide = static_cast<double>(a);
+  return fmax(a_wide, b);
+}
+
+/* Mixed (double, float) overload of max: b is widened to double before
+   calling fmax. */
+__MATH_FUNCTIONS_DECL__ double max(const double a, const float b)
+{
+  const double b_wide = static_cast<double>(b);
+  return fmax(a, b_wide);
+}
+
+
+#if !defined(__CUDA_ARCH__)
+#if defined(_WIN32)
+#define __HELPER_FUNC_LINKAGE static inline __host__ __device__
+#pragma warning (push)
+#pragma warning (disable : 4211)
+#else  /* !defined(_WIN32) */
+#define __HELPER_FUNC_LINKAGE inline __host__ __device__
+#endif  /* defined(_WIN32) */
+
+/* int minimum, compiled only when __CUDA_ARCH__ is not defined:
+   returns a when a < b, otherwise b. */
+__HELPER_FUNC_LINKAGE int min(const int a, const int b)
+{
+  if (a < b) {
+    return a;
+  }
+  return b;
+}
+
+/* unsigned int minimum, compiled only when __CUDA_ARCH__ is not defined:
+   returns a when a < b, otherwise b. */
+__HELPER_FUNC_LINKAGE unsigned int umin(const unsigned int a, const unsigned int b)
+{
+  if (a < b) {
+    return a;
+  }
+  return b;
+}
+
+/* long long minimum, compiled only when __CUDA_ARCH__ is not defined:
+   returns a when a < b, otherwise b. */
+__HELPER_FUNC_LINKAGE long long int llmin(const long long int a, const long long int b)
+{
+  if (a < b) {
+    return a;
+  }
+  return b;
+}
+
+/* unsigned long long minimum, compiled only when __CUDA_ARCH__ is not
+   defined: returns a when a < b, otherwise b. */
+__HELPER_FUNC_LINKAGE unsigned long long int ullmin(const unsigned long long int a,
+                                                    const unsigned long long int b)
+{
+  if (a < b) {
+    return a;
+  }
+  return b;
+}
+
+/* int maximum, compiled only when __CUDA_ARCH__ is not defined:
+   returns a when a > b, otherwise b. */
+__HELPER_FUNC_LINKAGE int max(const int a, const int b)
+{
+  if (a > b) {
+    return a;
+  }
+  return b;
+}
+
+/* unsigned int maximum, compiled only when __CUDA_ARCH__ is not defined:
+   returns a when a > b, otherwise b. */
+__HELPER_FUNC_LINKAGE unsigned int umax(const unsigned int a, const unsigned int b)
+{
+  if (a > b) {
+    return a;
+  }
+  return b;
+}
+
+/* long long maximum, compiled only when __CUDA_ARCH__ is not defined:
+   returns a when a > b, otherwise b. */
+__HELPER_FUNC_LINKAGE long long int llmax(const long long int a, const long long int b)
+{
+  if (a > b) {
+    return a;
+  }
+  return b;
+}
+
+/* unsigned long long maximum, compiled only when __CUDA_ARCH__ is not
+   defined: returns a when a > b, otherwise b. */
+__HELPER_FUNC_LINKAGE unsigned long long int ullmax(const unsigned long long int a,
+                                                    const unsigned long long int b)
+{
+  if (a > b) {
+    return a;
+  }
+  return b;
+}
+
+#if defined(_WIN32)
+#pragma warning (pop)
+#endif /* defined(_WIN32) */
+
+#undef __HELPER_FUNC_LINKAGE
+
+#endif /* !defined(__CUDA_ARCH__) */
+
+#undef __MATH_FUNCTIONS_DECL__
+#undef __MATH_FUNCTIONS_DEVICE_DECL__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#endif /* __cplusplus && __CUDACC__ */
+#if !defined(__CUDACC__)
+
+#include "host_defines.h"
+#include "math_constants.h"
+
+#define __cuda_INT_MAX \
+        ((int)((unsigned int)-1 >> 1))
+
+/*******************************************************************************
+*                                                                              *
+* ONLY FOR HOST CODE! NOT FOR DEVICE EXECUTION                                 *
+*                                                                              *
+*******************************************************************************/
+
+#include <crt/func_macro.h>
+
+#if defined(_WIN32)
+#pragma warning (push)
+#pragma warning (disable : 4211)
+
+#endif /* _WIN32 */
+
+#if defined(_WIN32) || defined(__APPLE__) || defined (__ANDROID__) || defined(__QNX__)
+
+/*
+ * Bit-level NaN test for a double. The sign bit is shifted out; what is
+ * left is a NaN pattern exactly when it is greater than 0xffe0000000000000
+ * (all exponent bits set and a non-zero mantissa). Returns non-zero for NaN.
+ */
+__func__(int __isnan(const double a))
+{
+  unsigned long long int bits;
+  memcpy(&bits, &a, sizeof(double));
+  const unsigned long long int without_sign = bits << 1ULL;
+  return without_sign > 0xffe0000000000000ULL;
+}
+
+#endif /* _WIN32 || __APPLE__ || __ANDROID__ || __QNX__ */
+
+#if defined(_WIN32) || defined(__APPLE__) || defined(__QNX__)
+
+/*******************************************************************************
+*                                                                              *
+* HOST IMPLEMENTATION FOR DOUBLE ROUTINES FOR WINDOWS, APPLE & QNX PLATFORMS   *
+*                                                                              *
+*******************************************************************************/
+
+/* Host fallback for exp10: evaluates pow(10.0, a). */
+__func__(double exp10(const double a))
+{
+  const double result = pow(10.0, a);
+  return result;
+}
+
+/* Host fallback for exp10f: evaluates exp10 in double precision and
+   narrows the result back to float. */
+__func__(float exp10f(const float a))
+{
+  const double wide_result = exp10(static_cast<double>(a));
+  return static_cast<float>(wide_result);
+}
+
+/* Host fallback for sincos: stores sin(a) through sptr, then cos(a)
+   through cptr. */
+__func__(void sincos(const double a, double *sptr, double *cptr))
+{
+  const double sine = sin(a);
+  *sptr = sine;
+  const double cosine = cos(a);
+  *cptr = cosine;
+}
+
+/* Host fallback for sincosf: runs the double-precision sincos on the
+   widened argument and narrows both results to float. */
+__func__(void sincosf(const float a, float *sptr, float *cptr))
+{
+  double sine_wide;
+  double cosine_wide;
+  const double a_wide = static_cast<double>(a);
+
+  sincos(a_wide, &sine_wide, &cosine_wide);
+  *sptr = static_cast<float>(sine_wide);
+  *cptr = static_cast<float>(cosine_wide);
+}
+
+/*
+ * Bit-level infinity test for a double. After shifting out the sign bit,
+ * an infinity is exactly the pattern 0xffe0000000000000 (all exponent bits
+ * set, zero mantissa). Returns non-zero for +inf and -inf.
+ */
+__func__(int __isinf(const double a))
+{
+  unsigned long long int bits;
+  memcpy(&bits, &a, sizeof(double));
+  const unsigned long long int without_sign = bits << 1ULL;
+  return without_sign == 0xffe0000000000000ULL;
+}
+
+#endif /* _WIN32 || __APPLE__ || __QNX__ */
+
+#if defined(_WIN32) || defined (__ANDROID__)
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+/* Host fallback for log2: natural logarithm scaled by the constant
+   1.44269504088896340. */
+__func__(double log2(const double a))
+{
+  const double scale = 1.44269504088896340;
+  return log(a) * scale;
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#endif /* _WIN32 || __ANDROID__ */
+
+#if defined(_WIN32)
+
+/*******************************************************************************
+*                                                                              *
+* HOST IMPLEMENTATION FOR DOUBLE ROUTINES FOR WINDOWS PLATFORM                 *
+*                                                                              *
+*******************************************************************************/
+
+/* Sign-bit test for a double: reinterprets the bit pattern as a signed
+   64-bit integer and reports whether it is negative. */
+__func__(int __signbit(double a))
+{
+  signed long long int bits;
+  memcpy(&bits, &a, sizeof(double));
+  const bool negative = bits < 0LL;
+  return negative;
+}
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+/* Host fallback for copysign: combines the magnitude bits of a (low 63
+   bits) with the sign bit of b (bit 63) and returns the result. */
+__func__(double copysign(double a, double b))
+{
+  unsigned long long int magnitude_bits;
+  unsigned long long int sign_bits;
+  unsigned long long int combined;
+
+  memcpy(&magnitude_bits, &a, sizeof(double));
+  memcpy(&sign_bits, &b, sizeof(double));
+  magnitude_bits &= 0x7fffffffffffffffULL;
+  sign_bits &= 0x8000000000000000ULL;
+  combined = magnitude_bits | sign_bits;
+  memcpy(&a, &combined, sizeof(double));
+  return a;
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+/*
+ * Bit-level finiteness test for a double. After shifting out the sign bit,
+ * every pattern below 0xffe0000000000000 has at least one clear exponent
+ * bit. Returns non-zero when a is neither infinite nor NaN.
+ */
+__func__(int __finite(double a))
+{
+  unsigned long long int bits;
+  memcpy(&bits, &a, sizeof(double));
+  const unsigned long long int without_sign = bits << 1ULL;
+  return without_sign < 0xffe0000000000000ULL;
+}
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+/*
+ * Host fallback for fmax.
+ *   both NaN      -> a + b
+ *   exactly one NaN -> the other operand
+ *   both zero and b has its sign bit set -> a
+ *   otherwise     -> the larger operand
+ */
+__func__(double fmax(double a, double b))
+{
+  const int a_is_nan = __isnan(a);
+  const int b_is_nan = __isnan(b);
+
+  if (a_is_nan && b_is_nan) {
+    return a + b;
+  }
+  if (a_is_nan) {
+    return b;
+  }
+  if (b_is_nan) {
+    return a;
+  }
+  if ((a == 0.0) && (b == 0.0) && __signbit(b)) {
+    return a;
+  }
+  if (a > b) {
+    return a;
+  }
+  return b;
+}
+
+/*
+ * Host fallback for fmin.
+ *   both NaN      -> a + b
+ *   exactly one NaN -> the other operand
+ *   both zero and a has its sign bit set -> a
+ *   otherwise     -> the smaller operand
+ */
+__func__(double fmin(double a, double b))
+{
+  const int a_is_nan = __isnan(a);
+  const int b_is_nan = __isnan(b);
+
+  if (a_is_nan && b_is_nan) {
+    return a + b;
+  }
+  if (a_is_nan) {
+    return b;
+  }
+  if (b_is_nan) {
+    return a;
+  }
+  if ((a == 0.0) && (b == 0.0) && __signbit(a)) {
+    return a;
+  }
+  if (a < b) {
+    return a;
+  }
+  return b;
+}
+
+/* Host fallback for trunc: ceil for inputs below zero, floor for
+   everything else. */
+__func__(double trunc(double a))
+{
+  if (a < 0.0) {
+    return ceil(a);
+  }
+  return floor(a);
+}
+
+/*
+ * Host fallback for round. Inputs whose magnitude exceeds CUDART_TWO_TO_52
+ * are returned unchanged. Otherwise the magnitude is rounded with
+ * floor(|a| + 0.5), forced to zero when |a| < 0.5, and the sign of a is
+ * copied back onto the result.
+ */
+__func__(double round(double a))
+{
+  const double magnitude = fabs(a);
+
+  if (magnitude > CUDART_TWO_TO_52) {
+    return a;
+  }
+  double rounded = floor(magnitude + 0.5);
+  if (magnitude < 0.5) {
+    rounded = 0;
+  }
+  return copysign (rounded, a);
+}
+
+/* Host fallback for lround: round(a) converted to long int. */
+__func__(long int lround(double a))
+{
+  const double rounded = round(a);
+  return static_cast<long int>(rounded);
+}
+
+/* Host fallback for llround: round(a) converted to long long int. */
+__func__(long long int llround(double a))
+{
+  const double rounded = round(a);
+  return static_cast<long long int>(rounded);
+}
+
+/*
+ * Host fallback for rint. Magnitudes of at least CUDART_TWO_TO_52 are
+ * returned unchanged. Smaller magnitudes are rounded by adding and then
+ * subtracting CUDART_TWO_TO_52 in two separate steps, after which the sign
+ * of a is copied back onto the result.
+ */
+__func__(double rint(double a))
+{
+  const double magnitude = fabs(a);
+  double shifted = CUDART_TWO_TO_52 + magnitude;
+
+  if (magnitude >= CUDART_TWO_TO_52) {
+    return a;
+  }
+  shifted = shifted - CUDART_TWO_TO_52;
+  shifted = copysign (shifted, a);
+  return shifted;
+}
+
+/* Host fallback for nearbyint: delegates to rint. */
+__func__(double nearbyint(double a))
+{
+  const double rounded = rint(a);
+  return rounded;
+}
+
+/* Host fallback for lrint: rint(a) converted to long int. */
+__func__(long int lrint(double a))
+{
+  const double rounded = rint(a);
+  return static_cast<long int>(rounded);
+}
+
+/* Host fallback for llrint: rint(a) converted to long long int. */
+__func__(long long int llrint(double a))
+{
+  const double rounded = rint(a);
+  return static_cast<long long int>(rounded);
+}
+
+/*
+ * Host fallback for fdim: a - b when a > b, 0.0 when a <= b. When neither
+ * comparison holds (the operands are unordered), a is returned if it is a
+ * NaN and b otherwise.
+ */
+__func__(double fdim(double a, double b))
+{
+  if (a > b) {
+    return (a - b);
+  }
+  if (a <= b) {
+    return 0.0;
+  }
+  /* Neither comparison was true, so at least one operand is a NaN. */
+  return __isnan(a) ? a : b;
+}
+
+/* Host fallback for scalbn: delegates to ldexp. */
+__func__(double scalbn(double a, int b))
+{
+  const double scaled = ldexp(a, b);
+  return scaled;
+}
+
+/* Host fallback for scalbln: clamps the long exponent b to the range
+   [-2147483647 - 1, 2147483647] and forwards to scalbn. */
+__func__(double scalbln(double a, long int b))
+{
+  if (b > 2147483647L) {
+    return scalbn(a, 2147483647);
+  }
+  if (b < (-2147483647 - 1)) {
+    return scalbn(a, (-2147483647 - 1));
+  }
+  return scalbn(a, static_cast<int>(b));
+}
+
+/* Host fallback for exp2: evaluates pow(2.0, a). */
+__func__(double exp2(double a))
+{
+  const double result = pow(2.0, a);
+  return result;
+}
+
+/*  
+ * Host fallback for log1p(a), i.e. log(1 + a).
+ *
+ * The following is based on: David Goldberg, "What every computer scientist 
+ * should know about floating-point arithmetic", ACM Computing Surveys, Volume 
+ * 23, Issue 1, March 1991.
+ *
+ * When 1.0 + a rounds to exactly 1.0 the input itself is returned. Otherwise
+ * log(1 + a) is computed and, for a < 1.0, rescaled by a / ((1 + a) - 1).
+ * The temporaries are declared volatile; presumably this keeps the compiler
+ * from simplifying (1 + a) - 1 back to a -- confirm before removing.
+ */
+__func__(double log1p(double a))
+{
+  volatile double u, m;
+
+  u = 1.0 + a;
+  if (u == 1.0) {
+    /* a very close to zero */
+    u = a;
+  } else {
+    m = u - 1.0;       /* the part of a that survived the addition to 1.0 */
+    u = log(u);
+    if (a < 1.0) {
+      /* a somewhat close to zero */
+      u = a * u;
+      u = u / m;
+    }
+  }
+  return u;
+}
+
+/*
+ * Host fallback for expm1(a), i.e. exp(a) - 1.
+ *
+ * This code based on: http://www.cs.berkeley.edu/~wkahan/Math128/Sumnfp.pdf
+ *
+ * When exp(a) - 1 evaluates to exactly 0.0 the input itself is returned.
+ * For |a| < 1.0 the difference is rescaled by a / log(exp(a)). The
+ * temporaries are declared volatile; presumably this keeps the compiler from
+ * simplifying log(exp(a)) back to a -- confirm before removing.
+ */
+__func__(double expm1(double a))
+{
+  volatile double u, m;
+
+  u = exp(a);
+  m = u - 1.0;
+  if (m == 0.0) {
+    /* a very close to zero */
+    m = a;
+  } 
+  else if (fabs(a) < 1.0) {
+    /* a somewhat close to zero */
+    u = log(u);
+    m = m * a;
+    m = m / u;
+  }
+  return m;
+}
+
+/*
+ * Host fallback for cbrt. Zeros and infinities are returned unchanged.
+ * Otherwise an initial estimate exp2(log2(|a|) / 3) is refined with one
+ * correction step and the sign of a is copied onto the result.
+ */
+__func__(double cbrt(double a))
+{
+  if (a == 0.0 || __isinf(a)) {
+    return a;
+  }
+
+  const double magnitude = fabs(a);
+  /* initial approximation */
+  double root = exp2(CUDART_THIRD * log2(magnitude));
+  /* refine approximation */
+  root = root - (root - (magnitude / (root * root))) * CUDART_THIRD;
+  root = copysign(root, a);
+  return root;
+}
+
+/*
+ * Host fallback for acosh. When a - 1.0 is indistinguishable from a the
+ * result is log(2.0) + log(a); otherwise it is
+ * log1p((a - 1) + sqrt((a + 1) * (a - 1))).
+ */
+__func__(double acosh(double a))
+{
+  const double a_minus_one = a - 1.0;
+
+  if (a_minus_one == a) {
+    return log(2.0) + log(a);
+  }
+  const double a_plus_one = a + 1.0;
+  const double argument = a_minus_one + sqrt(a_plus_one * a_minus_one);
+  return log1p(argument);
+}
+
+/*
+ * Host fallback for asinh, evaluated on |a| with the sign of a copied back
+ * at the end. For |a| > 1e18 the result is log(2.0) + log(|a|); otherwise
+ * it is log1p(|a| + |a| / (1/|a| + sqrt(1 + 1/|a|^2))).
+ */
+__func__(double asinh(double a))
+{
+  const double magnitude = fabs(a);
+  double result;
+
+  if (magnitude > 1e18) {
+    result = log(2.0) + log(magnitude);
+  } else {
+    const double reciprocal = 1.0 / magnitude;
+    result = magnitude + magnitude / (reciprocal + sqrt(1.0 + reciprocal * reciprocal));
+    result = log1p(result);
+  }
+  result = copysign(result, a);
+  return result;
+}
+
+/*
+ * Host fallback for atanh. NaN inputs return a + a. Otherwise the value
+ * 0.5 * log1p(2|a| / (1 - |a|)) is computed on the magnitude and negated
+ * when a has its sign bit set, unless that value is itself a NaN.
+ */
+__func__(double atanh(double a))
+{
+  if (__isnan(a)) {
+    return a + a;
+  }
+
+  const double magnitude = fabs(a);
+  double value = (2.0 * magnitude) / (1.0 - magnitude);
+  value = 0.5 * log1p(value);
+
+  const bool keep_as_is = __isnan(value) || !__signbit(a);
+  return keep_as_is ? value : -value;
+}
+
+/*
+ * Host fallback for ilogb: returns the unbiased binary exponent of a.
+ *
+ *   NaN        -> -__cuda_INT_MAX-1
+ *   +/-inf     -> __cuda_INT_MAX
+ *   +/-0       -> -__cuda_INT_MAX-1
+ *   normal     -> exponent field minus the bias 1023
+ *   denormal   -> -1022 minus the number of left shifts needed to bring
+ *                 the mantissa up to the implicit-bit position
+ */
+__func__(int ilogb(double a))
+{
+  unsigned long long int i;
+  int expo = -1022;
+
+  if (__isnan(a)) return -__cuda_INT_MAX-1;
+  if (__isinf(a)) return __cuda_INT_MAX;
+  memcpy(&i, &a, sizeof(double));
+  i = i & 0x7fffffffffffffffULL;   /* discard the sign bit */
+  if (i == 0) return -__cuda_INT_MAX-1;
+  if (i >= 0x0010000000000000ULL) {
+    /* Narrow the 11-bit exponent field to int BEFORE removing the bias, as
+       logb does. Subtracting in unsigned long long first wraps around for
+       exponents below the bias and then depends on implementation-defined
+       narrowing to recover the negative value. */
+    return static_cast<int>((i >> 52ULL) & 0x7ffU) - 1023;
+  }
+  /* Denormal: normalize the mantissa, lowering the exponent per shift. */
+  while (i < 0x0010000000000000ULL) {
+    expo--;
+    i <<= 1;
+  }
+  return expo;
+}
+
+/*
+ * Host fallback for logb: returns the unbiased binary exponent of a as a
+ * double.
+ *
+ *   NaN        -> a + a
+ *   +/-inf     -> fabs(a)
+ *   +/-0       -> -1.0 / fabs(a)
+ *   normal     -> exponent field minus the bias 1023
+ *   denormal   -> -1022 minus the number of left shifts needed to bring
+ *                 the mantissa up to the implicit-bit position
+ */
+__func__(double logb(double a))
+{
+  unsigned long long int bits;
+
+  if (__isnan(a)) {
+    return a + a;
+  }
+  if (__isinf(a)) {
+    return fabs(a);
+  }
+  memcpy(&bits, &a, sizeof(double));
+  bits &= 0x7fffffffffffffffULL;
+  if (bits == 0) {
+    return -1.0/fabs(a);
+  }
+  if (bits >= 0x0010000000000000ULL) {
+    const int biased = (int)((bits >> 52ULL) & 0x7ffU);
+    return (double)(biased - 1023);
+  }
+
+  int exponent = -1022;
+  for (; bits < 0x0010000000000000ULL; bits <<= 1) {
+    exponent--;
+  }
+  return static_cast<double>(exponent);
+}
+
+/*
+ * Host fallback for remquo: returns the remainder of a divided by b and
+ * stores a partial quotient through quo.
+ *
+ * The operands are split into sign, unbiased exponent and a 64-bit mantissa
+ * (left-aligned, top bit set), the quotient is developed one bit per
+ * iteration by restoring long division, and the remainder is re-assembled
+ * into a double by hand.
+ *
+ * *quo receives the low three bits of the integer quotient, negated when
+ * the signs of a and b differ. It is set to 0 on every special-case path.
+ *
+ * Special cases, in the order they are tested:
+ *   a or b is NaN           -> returns a + b
+ *   a is inf, or b == 0.0   -> returns the NaN pattern 0xfff8000000000000
+ *   a == 0.0, or b is inf   -> returns a
+ */
+__func__(double remquo(double a, double b, int *quo))
+{
+  unsigned long long int aa, bb;
+  int rem1 = 1; /* do FPREM1, a.k.a IEEE remainder */
+  int expo_a;
+  int expo_b;
+  unsigned long long mant_a;
+  unsigned long long mant_b;
+  unsigned long long mant_c;
+  unsigned long long temp;
+  int sign_a;
+  int sign_b;
+  int sign_c;
+  int expo_c;
+  int expodiff;
+  int quot = 0;                 /* initialize quotient */
+  int l;
+  int iter;
+
+  /* Unpack a: mantissa left-aligned with bit 63 forced on, unbiased
+     exponent, sign bit. */
+  memcpy(&aa, &a, sizeof(double));
+  mant_a = (aa << 11ULL) | 0x8000000000000000ULL;
+  expo_a = (int)((aa >> 52ULL) & 0x7ffU) - 1023;
+  sign_a = (int)(aa >> 63ULL);
+
+  /* Unpack b the same way. */
+  memcpy(&bb, &b, sizeof(double));
+  mant_b = (bb << 11ULL) | 0x8000000000000000ULL;
+  expo_b = (int)((bb >> 52ULL) & 0x7ffU) - 1023;
+  sign_b = (int)(bb >> 63ULL);
+
+  sign_c = sign_a;  /* remainder has sign of dividend */
+  expo_c = expo_a;  /* default */
+      
+  /* handled NaNs and infinities */
+  if (__isnan(a) || __isnan(b)) {
+    *quo = quot;
+    return a + b;
+  }
+  if (__isinf(a) || (b == 0.0)) {
+    *quo = quot;
+    aa = 0xfff8000000000000ULL;
+    memcpy(&a, &aa, sizeof(double));
+    return a;
+  }
+  if ((a == 0.0) || (__isinf(b))) {
+    *quo = quot;
+    return a;
+  }
+  /* normalize denormals */
+  if (expo_a < -1022) {
+    mant_a = mant_a + mant_a;
+    while (mant_a < 0x8000000000000000ULL) {
+      mant_a = mant_a + mant_a;
+      expo_a--;
+    }
+  } 
+  if (expo_b < -1022) {
+    mant_b = mant_b + mant_b;
+    while (mant_b < 0x8000000000000000ULL) {
+      mant_b = mant_b + mant_b;
+      expo_b--;
+    }
+  }
+  expodiff = expo_a - expo_b;
+  /* clamp iterations if exponent difference negative */
+  if (expodiff < 0) {
+    iter = -1;
+  } else {
+    iter = expodiff;
+  }
+  /* Shift dividend and divisor right by one bit to prevent overflow
+     during the division algorithm.
+   */
+  mant_a = mant_a >> 1ULL;
+  mant_b = mant_b >> 1ULL;
+  expo_c = expo_a - iter; /* default exponent of result   */
+
+  /* Use binary longhand division (restoring) */
+  for (l = 0; l < (iter + 1); l++) {
+    mant_a = mant_a - mant_b;
+    if (mant_a & 0x8000000000000000ULL) {
+      /* Subtraction went negative: restore, quotient bit is 0. */
+      mant_a = mant_a + mant_b;
+      quot = quot + quot;
+    } else {
+      /* Subtraction succeeded: quotient bit is 1. */
+      quot = quot + quot + 1;
+    }
+    mant_a = mant_a + mant_a;
+  }
+
+  /* Save current remainder */
+  mant_c = mant_a;
+  /* If remainder's mantissa is all zeroes, final result is zero. */
+  if (mant_c == 0) {
+    quot = quot & 7;
+    *quo = (sign_a ^ sign_b) ? -quot : quot;
+    aa = static_cast<unsigned long long int>(sign_c) << 63ULL;
+    memcpy(&a, &aa, sizeof(double));
+    return a;
+  }
+  /* Normalize result */
+  while (!(mant_c & 0x8000000000000000ULL)) {
+    mant_c = mant_c + mant_c;
+    expo_c--;
+  }
+  /* For IEEE remainder (quotient rounded to nearest-even) we might need to
+     do a final subtraction of the divisor from the remainder.
+  */
+  if (rem1 && ((expodiff+1) >= 0)) {
+    temp = mant_a - mant_b;
+    /* round quotient to nearest even */
+    if (((temp != 0ULL) && (!(temp & 0x8000000000000000ULL))) ||
+        ((temp == 0ULL) && (quot & 1))) {
+      mant_a = mant_a >> 1ULL;
+      quot++;
+      /* Since the divisor is greater than the remainder, the result will
+         have opposite sign of the dividend. To avoid a negative mantissa
+         when subtracting the divisor from remainder, reverse subtraction
+      */
+      sign_c = 1 ^ sign_c;
+      expo_c = expo_a - iter + 1;
+      mant_c = mant_b - mant_a;
+      /* normalize result */
+      while (!(mant_c & 0x8000000000000000ULL)) {
+        mant_c = mant_c + mant_c;
+        expo_c--;
+      }
+    }
+  }
+  /* package up result */
+  if (expo_c >= -1022) { /* normal */
+    /* The mantissa still carries its top bit, which lands on exponent bit 52
+       after the shift; that is why the exponent is biased by 1022 here. */
+    mant_c = ((mant_c >> 11ULL) +
+              (((static_cast<unsigned long long>(sign_c)) << 63ULL) +
+               (((unsigned long long)(expo_c + 1022)) << 52ULL)));
+  } else { /* denormal */
+    mant_c = (((static_cast<unsigned long long>(sign_c)) << 63ULL) +
+              (mant_c >> (unsigned long long)(11 - expo_c - 1022)));
+  }
+  quot = quot & 7; /* mask quotient down to least significant three bits */
+  *quo = (sign_a ^ sign_b) ? -quot : quot;
+  memcpy(&a, &mant_c, sizeof(double));
+  return a;
+}
+
+/* Host fallback for remainder: calls remquo and discards the partial
+   quotient it reports. */
+__func__(double remainder(double a, double b))
+{
+  int unused_quotient;
+  const double result = remquo (a, b, &unused_quotient);
+  return result;
+}
+
+__func__(double fma (double a, double b, double c))
+{
+  struct {
+    unsigned int lo;
+    unsigned int hi;
+  } xx, yy, zz, ww;
+  double d;
+  unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z;
+
+  memcpy(&xx, &a, sizeof(double));
+  memcpy(&yy, &b, sizeof(double));
+  memcpy(&zz, &c, sizeof(double));
+
+  expo_z = 0x7FFU;
+  t =  xx.hi >> 20;
+  expo_x = expo_z & t;
+  expo_x = expo_x - 1;    /* expo(x) - 1 */
+  t =  yy.hi >> 20;
+  expo_y = expo_z & t;
+  expo_y = expo_y - 1;    /* expo(y) - 1 */
+  t =  zz.hi >> 20;
+  expo_z = expo_z & t;
+  expo_z = expo_z - 1;    /* expo(z) - 1 */
+
+  if (!((expo_x <= 0x7FDU) &&
+        (expo_y <= 0x7FDU) &&
+        (expo_z <= 0x7FDU))) {
+    
+    /* fma (nan, y, z) --> nan
+       fma (x, nan, z) --> nan
+       fma (x, y, nan) --> nan 
+    */
+    if (((yy.hi << 1) | (yy.lo != 0)) > 0xffe00000U) {
+      yy.hi |= 0x00080000U;
+      memcpy(&d, &yy, sizeof(double));
+      return d;
+    }
+    if (((zz.hi << 1) | (zz.lo != 0)) > 0xffe00000U) {
+      zz.hi |= 0x00080000U;
+      memcpy(&d, &zz, sizeof(double));
+      return d;
+    }
+    if (((xx.hi << 1) | (xx.lo != 0)) > 0xffe00000U) {
+      xx.hi |= 0x00080000U;
+      memcpy(&d, &xx, sizeof(double));
+      return d;
+    }
+    
+    /* fma (0, inf, z) --> INDEFINITE
+       fma (inf, 0, z) --> INDEFINITE
+       fma (-inf,+y,+inf) --> INDEFINITE
+       fma (+x,-inf,+inf) --> INDEFINITE
+       fma (+inf,-y,+inf) --> INDEFINITE
+       fma (-x,+inf,+inf) --> INDEFINITE
+       fma (-inf,-y,-inf) --> INDEFINITE
+       fma (-x,-inf,-inf) --> INDEFINITE
+       fma (+inf,+y,-inf) --> INDEFINITE
+       fma (+x,+inf,-inf) --> INDEFINITE
+    */
+    if (((((xx.hi << 1) | xx.lo) == 0) && 
+         (((yy.hi << 1) | (yy.lo != 0)) == 0xffe00000U)) ||
+        ((((yy.hi << 1) | yy.lo) == 0) && 
+         (((xx.hi << 1) | (xx.lo != 0)) == 0xffe00000U))) {
+      xx.hi = 0xfff80000U;
+      xx.lo = 0x00000000U;
+      memcpy(&d, &xx, sizeof(double));
+      return d;
+    }
+    if (((zz.hi << 1) | (zz.lo != 0)) == 0xffe00000U) {
+      if ((((yy.hi << 1) | (yy.lo != 0)) == 0xffe00000U) ||
+          (((xx.hi << 1) | (xx.lo != 0)) == 0xffe00000U)) {
+        if ((int)(xx.hi ^ yy.hi ^ zz.hi) < 0) {
+          xx.hi = 0xfff80000U;
+          xx.lo = 0x00000000U;
+          memcpy(&d, &xx, sizeof(double));
+          return d;
+        }
+      }
+    }
+    /* fma (inf, y, z) --> inf
+       fma (x, inf, z) --> inf
+       fma (x, y, inf) --> inf
+    */
+    if (((xx.hi << 1) | (xx.lo != 0)) == 0xffe00000U) {
+      xx.hi = xx.hi ^ (yy.hi & 0x80000000U);
+      memcpy(&d, &xx, sizeof(double));
+      return d;
+    }
+    if (((yy.hi << 1) | (yy.lo != 0)) == 0xffe00000U) {
+      yy.hi = yy.hi ^ (xx.hi & 0x80000000U);
+      memcpy(&d, &yy, sizeof(double));
+      return d;
+    }
+    if (((zz.hi << 1) | (zz.lo != 0)) == 0xffe00000U) {
+      memcpy(&d, &zz, sizeof(double));
+      return d;
+    }
+    /* fma (+0, -y, -0) --> -0
+       fma (-0, +y, -0) --> -0
+       fma (+x, -0, -0) --> -0
+       fma (-x, +0, -0) --> -0
+    */
+    if ((zz.hi == 0x80000000U) && (zz.lo == 0)) {
+      if ((((xx.hi << 1) | xx.lo) == 0) ||
+          (((yy.hi << 1) | yy.lo) == 0)) {
+        if ((int)(xx.hi ^ yy.hi) < 0) {
+          memcpy(&d, &zz, sizeof(double));
+          return d;
+        }
+      }
+    }
+    /* fma (0, y, 0) --> +0  (-0 if round down and signs of addend differ)
+       fma (x, 0, 0) --> +0  (-0 if round down and signs of addend differ)
+    */
+    if ((((zz.hi << 1) | zz.lo) == 0) &&
+        ((((xx.hi << 1) | xx.lo) == 0) ||
+         (((yy.hi << 1) | yy.lo) == 0))) {
+      zz.hi &= 0x7fffffffU;
+      memcpy(&d, &zz, sizeof(double));
+      return d;
+    }
+    
+    /* fma (0, y, z) --> z
+       fma (x, 0, z) --> z
+    */
+    if ((((xx.hi << 1) | xx.lo) == 0) ||
+        (((yy.hi << 1) | yy.lo) == 0)) {
+      memcpy(&d, &zz, sizeof(double));
+      return d;
+    }
+    
+    if (expo_x == 0xffffffffU) {
+      expo_x++;
+      t = xx.hi & 0x80000000U;
+      s = xx.lo >> 21;
+      xx.lo = xx.lo << 11;
+      xx.hi = xx.hi << 11;
+      xx.hi = xx.hi | s;
+      if (!xx.hi) {
+        xx.hi = xx.lo;
+        xx.lo = 0;
+        expo_x -= 32;
+      }
+      while (static_cast<int>(xx.hi) > 0) {
+        s = xx.lo >> 31;
+        xx.lo = xx.lo + xx.lo;
+        xx.hi = xx.hi + xx.hi;
+        xx.hi = xx.hi | s;
+        expo_x--;
+      }
+      xx.lo = (xx.lo >> 11);
+      xx.lo |= (xx.hi << 21);
+      xx.hi = (xx.hi >> 11) | t;
+    }
+    if (expo_y == 0xffffffffU) {
+      expo_y++;
+      t = yy.hi & 0x80000000U;
+      s = yy.lo >> 21;
+      yy.lo = yy.lo << 11;
+      yy.hi = yy.hi << 11;
+      yy.hi = yy.hi | s;
+      if (!yy.hi) {
+        yy.hi = yy.lo;
+        yy.lo = 0;
+        expo_y -= 32;
+      }
+      while (static_cast<int>(yy.hi) > 0) {
+        s = yy.lo >> 31;
+        yy.lo = yy.lo + yy.lo;
+        yy.hi = yy.hi + yy.hi;
+        yy.hi = yy.hi | s;
+        expo_y--;
+      }
+      yy.lo = (yy.lo >> 11);
+      yy.lo |= (yy.hi << 21);
+      yy.hi = (yy.hi >> 11) | t;
+    }
+    if (expo_z == 0xffffffffU) {
+      expo_z++;
+      t = zz.hi & 0x80000000U;
+      s = zz.lo >> 21;
+      zz.lo = zz.lo << 11;
+      zz.hi = zz.hi << 11;
+      zz.hi = zz.hi | s;
+      if (!zz.hi) {
+        zz.hi = zz.lo;
+        zz.lo = 0;
+        expo_z -= 32;
+      }
+      while (static_cast<int>(zz.hi) > 0) {
+        s = zz.lo >> 31;
+        zz.lo = zz.lo + zz.lo;
+        zz.hi = zz.hi + zz.hi;
+        zz.hi = zz.hi | s;
+        expo_z--;
+      }
+      zz.lo = (zz.lo >> 11);
+      zz.lo |= (zz.hi << 21);
+      zz.hi = (zz.hi >> 11) | t;
+    }
+  }
+  
+  expo_x = expo_x + expo_y;
+  expo_y = xx.hi ^ yy.hi;
+  t = xx.lo >> 21;
+  xx.lo = xx.lo << 11;
+  xx.hi = xx.hi << 11;
+  xx.hi = xx.hi | t;
+  yy.hi = yy.hi & 0x000fffffU;
+  xx.hi = xx.hi | 0x80000000U; /* set mantissa hidden bit */
+  yy.hi = yy.hi | 0x00100000U; /* set mantissa hidden bit */
+
+  prod0 = xx.lo * yy.lo;
+  prod1 =(unsigned)((static_cast<unsigned long long>(xx.lo)*static_cast<unsigned long long>(yy.lo))>>32ULL);
+  prod2 = xx.hi * yy.lo;
+  prod3 = xx.lo * yy.hi;
+  prod1 += prod2;
+  t = (unsigned)(prod1 < prod2);
+  prod1 += prod3;
+  t += prod1 < prod3;
+  prod2 =(unsigned)((static_cast<unsigned long long>(xx.hi)*static_cast<unsigned long long>(yy.lo))>>32ULL);
+  prod3 =(unsigned)((static_cast<unsigned long long>(xx.lo)*static_cast<unsigned long long>(yy.hi))>>32ULL);
+  prod2 += prod3;
+  s = (unsigned)(prod2 < prod3);
+  prod3 = xx.hi * yy.hi;
+  prod2 += prod3;
+  s += prod2 < prod3;
+  prod2 += t;
+  s += prod2 < t;
+  prod3 =(unsigned)((static_cast<unsigned long long>(xx.hi)*static_cast<unsigned long long>(yy.hi))>>32ULL);
+  prod3 = prod3 + s;
+  
+  yy.lo = prod0;                 /* mantissa */
+  yy.hi = prod1;                 /* mantissa */
+  xx.lo = prod2;                 /* mantissa */
+  xx.hi = prod3;                 /* mantissa */
+  expo_x = expo_x - (1023 - 2);  /* expo-1 */
+  expo_y = expo_y & 0x80000000U;  /* sign */
+
+  if (xx.hi < 0x00100000U) {
+    s = xx.lo >> 31;
+    s = (xx.hi << 1) + s;
+    xx.hi = s;
+    s = yy.hi >> 31;
+    s = (xx.lo << 1) + s;
+    xx.lo = s;
+    s = yy.lo >> 31;
+    s = (yy.hi << 1) + s;
+    yy.hi = s;
+    s = yy.lo << 1;
+    yy.lo = s;
+    expo_x--;
+  }
+
+  t = 0;
+  if (((zz.hi << 1) | zz.lo) != 0) { /* z is not zero */
+    
+    s = zz.hi & 0x80000000U;
+    
+    zz.hi &= 0x000fffffU;
+    zz.hi |= 0x00100000U;
+    ww.hi = 0;
+    ww.lo = 0;
+    
+    /* compare and swap. put augend into xx:yy */
+    if (static_cast<int>(expo_z) > static_cast<int>(expo_x)) {
+      t = expo_z;
+      expo_z = expo_x;
+      expo_x = t;
+      t = zz.hi;
+      zz.hi = xx.hi;
+      xx.hi = t;
+      t = zz.lo;
+      zz.lo = xx.lo;
+      xx.lo = t;
+      t = ww.hi;
+      ww.hi = yy.hi;
+      yy.hi = t;
+      t = ww.lo;
+      ww.lo = yy.lo;
+      yy.lo = t;
+      t = expo_y;
+      expo_y = s;
+      s = t;
+    }
+    
+    /* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */
+    /* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */
+    expo_z = expo_x - expo_z;
+    u = expo_y ^ s;
+    if (expo_z <= 107) {
+      /* denormalize addend */
+      t = 0;
+      while (expo_z >= 32) {
+        t     = ww.lo | (t != 0);
+        ww.lo = ww.hi;
+        ww.hi = zz.lo;
+        zz.lo = zz.hi;
+        zz.hi = 0;
+        expo_z -= 32;
+      }
+      if (expo_z) {
+        t     = (t     >> expo_z) | (ww.lo << (32 - expo_z)) | 
+                ((t << (32 - expo_z)) != 0);
+        ww.lo = (ww.lo >> expo_z) | (ww.hi << (32 - expo_z));
+        ww.hi = (ww.hi >> expo_z) | (zz.lo << (32 - expo_z));
+        zz.lo = (zz.lo >> expo_z) | (zz.hi << (32 - expo_z));
+        zz.hi = (zz.hi >> expo_z);
+      }
+    } else {
+      t = 1;
+      ww.lo = 0;
+      ww.hi = 0;
+      zz.lo = 0;
+      zz.hi = 0;
+    }
+    if (static_cast<int>(u) < 0) {
+      /* signs differ, effective subtraction */
+      t = (unsigned)(-static_cast<int>(t));
+      s = (unsigned)(t != 0);
+      u = yy.lo - s;
+      s = (unsigned)(u > yy.lo);
+      yy.lo = u - ww.lo;
+      s += yy.lo > u;
+      u = yy.hi - s;
+      s = (unsigned)(u > yy.hi);
+      yy.hi = u - ww.hi;
+      s += yy.hi > u;
+      u = xx.lo - s;
+      s = (unsigned)(u > xx.lo);
+      xx.lo = u - zz.lo;
+      s += xx.lo > u;
+      xx.hi = (xx.hi - zz.hi) - s;
+      if (!(xx.hi | xx.lo | yy.hi | yy.lo | t)) {
+        /* complete cancelation, return 0 */
+        memcpy(&d, &xx, sizeof(double));
+        return d;
+      }
+      if (static_cast<int>(xx.hi) < 0) {
+        /* Oops, augend had smaller mantissa. Negate mantissa and flip
+           sign of result
+        */
+        t = ~t;
+        yy.lo = ~yy.lo;
+        yy.hi = ~yy.hi;
+        xx.lo = ~xx.lo;
+        xx.hi = ~xx.hi;
+        if (++t == 0) {
+          if (++yy.lo == 0) {
+            if (++yy.hi == 0) {
+              if (++xx.lo == 0) {
+              ++xx.hi;
+              }
+            }
+          }
+        }
+        expo_y ^= 0x80000000U;
+      }
+        
+      /* normalize mantissa, if necessary */
+      while (!(xx.hi & 0x00100000U)) {
+        xx.hi = (xx.hi << 1) | (xx.lo >> 31);
+        xx.lo = (xx.lo << 1) | (yy.hi >> 31);
+        yy.hi = (yy.hi << 1) | (yy.lo >> 31);
+        yy.lo = (yy.lo << 1);
+        expo_x--;
+      }
+    } else {
+      /* signs are the same, effective addition */
+      yy.lo = yy.lo + ww.lo;
+      s = (unsigned)(yy.lo < ww.lo);
+      yy.hi = yy.hi + s;
+      u = (unsigned)(yy.hi < s);
+      yy.hi = yy.hi + ww.hi;
+      u += yy.hi < ww.hi;
+      xx.lo = xx.lo + u;
+      s = (unsigned)(xx.lo < u);
+      xx.lo = xx.lo + zz.lo;
+      s += xx.lo < zz.lo;
+      xx.hi = xx.hi + zz.hi + s;
+      if (xx.hi & 0x00200000U) {
+        t = t | (yy.lo << 31);
+        yy.lo = (yy.lo >> 1) | (yy.hi << 31);
+        yy.hi = (yy.hi >> 1) | (xx.lo << 31);
+        xx.lo = (xx.lo >> 1) | (xx.hi << 31);
+        xx.hi = ((xx.hi & 0x80000000U) | (xx.hi >> 1)) & ~0x40000000U;
+        expo_x++;
+      }
+    }
+  }
+  t = yy.lo | (t != 0);
+  t = yy.hi | (t != 0);
+        
+  xx.hi |= expo_y; /* or in sign bit */
+  if (expo_x <= 0x7FDU) {
+    /* normal */
+    xx.hi = xx.hi & ~0x00100000U; /* lop off integer bit */
+    s = xx.lo & 1; /* mantissa lsb */
+    u = xx.lo;
+    xx.lo += (t == 0x80000000U) ? s : (t >> 31);
+    xx.hi += (u > xx.lo);
+    xx.hi += ((expo_x + 1) << 20);
+    memcpy(&d, &xx, sizeof(double));
+    return d;
+  } else if (static_cast<int>(expo_x) >= 2046) {
+    /* overflow */
+    xx.hi = (xx.hi & 0x80000000U) | 0x7ff00000U;
+    xx.lo = 0;
+    memcpy(&d, &xx, sizeof(double));
+    return d;
+  }
+  /* subnormal */
+  expo_x = (unsigned)(-static_cast<int>(expo_x));
+  if (expo_x > 54) {
+    xx.hi = xx.hi & 0x80000000U;
+    xx.lo = 0;
+    memcpy(&d, &xx, sizeof(double));
+    return d;
+  }  
+  yy.hi = xx.hi &  0x80000000U;   /* save sign bit */
+  xx.hi = xx.hi & ~0xffe00000U;
+  if (expo_x >= 32) {
+    t = xx.lo | (t != 0);
+    xx.lo = xx.hi;
+    xx.hi = 0;
+    expo_x -= 32;
+  }
+  if (expo_x) {
+    t     = (t     >> expo_x) | (xx.lo << (32 - expo_x)) | (t != 0);
+    xx.lo = (xx.lo >> expo_x) | (xx.hi << (32 - expo_x));
+    xx.hi = (xx.hi >> expo_x);
+  }
+  expo_x = xx.lo & 1;
+  u = xx.lo;
+  xx.lo += (t == 0x80000000U) ? expo_x : (t >> 31);
+  xx.hi += (u > xx.lo);
+  xx.hi |= yy.hi;
+  memcpy(&d, &xx, sizeof(double));
+  return d;
+}
+
+__func__(double nextafter(double a, double b))
+{
+  /* Host fallback: move a by one representable double in the direction
+     of b by stepping its raw bit pattern. */
+  unsigned long long int bits_a;
+  unsigned long long int bits_b;
+
+  /* Any NaN operand propagates through the sum. */
+  if (__isnan(a) || __isnan(b)) {
+    return a + b;
+  }
+  memcpy(&bits_a, &a, sizeof(double));
+  memcpy(&bits_b, &b, sizeof(double));
+  /* With the sign bits shifted out, zero means both operands are zeros. */
+  if (((bits_a | bits_b) << 1ULL) == 0ULL) {
+    return b;
+  }
+  /* Leaving zero: smallest subnormal carrying the sign of b. */
+  if (a == 0.0) {
+    return copysign (4.9406564584124654e-324, b);
+  }
+  /* a is nonzero and ordered here, so "not negative" means positive.
+     Growing in magnitude increments the pattern, shrinking decrements it. */
+  if (a < b) {
+    if (a < 0.0) {
+      bits_a--;
+    } else {
+      bits_a++;
+    }
+  } else if (a > b) {
+    if (a < 0.0) {
+      bits_a++;
+    } else {
+      bits_a--;
+    }
+  }
+  memcpy(&a, &bits_a, sizeof(double));
+  return a;
+}
+
+__func__(double erf(double a))
+{
+  /* Host fallback for the error function. For |a| >= 1 a polynomial in
+     |a| is scaled by exp(-|a|*|a|) and subtracted from 1; otherwise a is
+     multiplied by a polynomial in a*a. */
+  double poly;
+  const double mag = fabs(a);
+
+  if (!(mag >= 1.0)) {
+    /* |a| < 1; a NaN argument also takes this path and propagates. */
+    const double sq = a * a;
+    poly =            -7.77946848895991420E-010;
+    poly = poly * sq + 1.37109803980285950E-008;
+    poly = poly * sq - 1.62063137584932240E-007;
+    poly = poly * sq + 1.64471315712790040E-006;
+    poly = poly * sq - 1.49247123020098620E-005;
+    poly = poly * sq + 1.20552935769006260E-004;
+    poly = poly * sq - 8.54832592931448980E-004;
+    poly = poly * sq + 5.22397760611847340E-003;
+    poly = poly * sq - 2.68661706431114690E-002;
+    poly = poly * sq + 1.12837916709441850E-001;
+    poly = poly * sq - 3.76126389031835210E-001;
+    poly = poly * sq + 1.12837916709551260E+000;
+    return poly * a;
+  }
+
+  /* |a| >= 1 */
+  poly =             -1.28836351230756500E-019;
+  poly = poly * mag + 1.30597472161093370E-017;
+  poly = poly * mag - 6.33924401259620500E-016;
+  poly = poly * mag + 1.96231865908940140E-014;
+  poly = poly * mag - 4.35272243559990750E-013;
+  poly = poly * mag + 7.37083927929352150E-012;
+  poly = poly * mag - 9.91402142550461630E-011;
+  poly = poly * mag + 1.08817017167760820E-009;
+  poly = poly * mag - 9.93918713097634620E-009;
+  poly = poly * mag + 7.66739923255145500E-008;
+  poly = poly * mag - 5.05440278302806720E-007;
+  poly = poly * mag + 2.87474157099000620E-006;
+  poly = poly * mag - 1.42246725399722510E-005;
+  poly = poly * mag + 6.16994555079419460E-005;
+  poly = poly * mag - 2.36305221938908790E-004;
+  poly = poly * mag + 8.05032844055371070E-004;
+  poly = poly * mag - 2.45833366629108140E-003;
+  poly = poly * mag + 6.78340988296706120E-003;
+  poly = poly * mag - 1.70509103597554640E-002;
+  poly = poly * mag + 3.93322852515666300E-002;
+  poly = poly * mag - 8.37271292613764040E-002;
+  poly = poly * mag + 1.64870423707623280E-001;
+  poly = poly * mag - 2.99729521787681470E-001;
+  poly = poly * mag + 4.99394435612628580E-001;
+  poly = poly * mag - 7.52014596480123030E-001;
+  poly = poly * mag + 9.99933138314926250E-001;
+  poly = poly * mag - 1.12836725321102670E+000;
+  poly = poly * mag + 9.99998988715182450E-001;
+  const double decay = exp (-mag * mag);
+  poly = 1.0 - poly * decay;
+  /* From 6.5 upward the magnitude of the result is pinned to exactly 1. */
+  if (mag >= 6.5) {
+    poly = 1.0;
+  }
+  /* The fit was evaluated on |a|; restore the sign of the argument. */
+  return copysign (poly, a);
+}
+
+__func__(double erfc(double a))
+{
+  /* Host fallback for the complementary error function.
+   *
+   *   NaN          -> NaN (a + a)
+   *   a <  0.75    -> 1 - erf(a)
+   *   a >  27.3    -> 0
+   *   a <  5.0     -> rational approximation in 1/a times exp(-a*a)
+   *   otherwise    -> polynomial in 1/(a*a) times exp(-a*a) / a
+   *
+   * exp(-a*a) is formed as exp(-h*h) * exp(-l), where h is a truncated to
+   * a multiple of 1/16 and l = (a - h) * (a + h).
+   */
+  double p, q, h, l;
+
+  /* Every comparison below is false for NaN, so without this guard a NaN
+     would reach the (int) conversion further down, which is undefined
+     for a NaN operand. */
+  if (__isnan(a)) {
+    return a + a;
+  }
+  if (a < 0.75) {
+    return 1.0 - erf(a);
+  } 
+  if (a > 27.3) {
+    return 0.0;
+  }
+  if (a < 5.0) {
+    double t;
+    t = 1.0 / a;
+    p =         1.9759923722227928E-008;
+    p = p * t - 1.0000002670474897E+000;
+    p = p * t - 7.4935303236347828E-001;
+    p = p * t - 1.5648136328071860E-001;
+    p = p * t + 1.2871196242447239E-001;
+    p = p * t + 1.1126459974811195E-001;
+    p = p * t + 4.0678642255914332E-002;
+    p = p * t + 7.9915414156678296E-003;
+    p = p * t + 7.1458332107840234E-004;
+    q =     t + 2.7493547525030619E+000;
+    q = q * t + 3.3984254815725423E+000;
+    q = q * t + 2.4635304979947761E+000;
+    q = q * t + 1.1405284734691286E+000;
+    q = q * t + 3.4130157606195649E-001;
+    q = q * t + 6.2250967676044953E-002;
+    q = q * t + 5.5661370941268700E-003;
+    q = q * t + 1.0575248365468671E-009;
+    p = p / q;
+    p = p * t;
+    /* a is in [0.75, 5.0) here, so a * 16.0 fits in an int. */
+    h = ((int)(a * 16.0)) * 0.0625;
+    l = (a - h) * (a + h);
+    q = exp(-h * h) * exp(-l);
+    q = q * 0.5;
+    p = p * q + q;
+    p = p * t;
+  } else {
+    double ooa, ooasq;
+
+    ooa = 1.0 / a;
+    ooasq = ooa * ooa;
+    p =            -4.0025406686930527E+005;
+    p = p * ooasq + 1.4420582543942123E+005;
+    p = p * ooasq - 2.7664185780951841E+004;
+    p = p * ooasq + 4.1144611644767283E+003;
+    p = p * ooasq - 5.8706000519209351E+002;
+    p = p * ooasq + 9.1490086446323375E+001;
+    p = p * ooasq - 1.6659491387740221E+001;
+    p = p * ooasq + 3.7024804085481784E+000;
+    p = p * ooasq - 1.0578553994424316E+000;
+    p = p * ooasq + 4.2314218745087778E-001;
+    p = p * ooasq - 2.8209479177354962E-001;
+    p = p * ooasq + 5.6418958354775606E-001;
+    /* a is in [5.0, 27.3] here, so a * 16.0 fits in an int. */
+    h = ((int)(a * 16.0)) * 0.0625;
+    l = (a - h) * (a + h);
+    q = exp(-h * h) * exp(-l);
+    p = p * ooa;
+    p = p * q;
+  }
+  return p;
+}
+
+__func__(double lgamma(double a)) /* Host fallback: evaluates a piecewise approximation on fa = |a|, then adjusts the result for negative a. */
+{
+  double s;
+  double t; /* value computed for fa; also the final result */
+  double i; /* shifted argument of the active interval; reused below for the negative-a correction */
+  double fa;
+  double sum;
+  long long int quot;
+  if (__isnan(a) || __isinf(a)) {
+    return a * a; /* NaN stays NaN; either infinity squares to +infinity */
+  }
+  fa = fabs(a);
+  if (fa >= 3.0) {
+    if (fa >= 8.0) {
+      /* Stirling approximation; coefficients from Hart et al, "Computer 
+       * Approximations", Wiley 1968. Approximation 5404. 
+       */
+      s = 1.0 / fa;
+      t = s * s; /* the series below is a polynomial in 1 / fa^2 */
+      sum =          -0.1633436431e-2;
+      sum = sum * t + 0.83645878922e-3;
+      sum = sum * t - 0.5951896861197e-3;
+      sum = sum * t + 0.793650576493454e-3;
+      sum = sum * t - 0.277777777735865004e-2;
+      sum = sum * t + 0.833333333333331018375e-1;
+      sum = sum * s + 0.918938533204672;
+      s = 0.5 * log (fa); /* the next five statements build (fa - 0.5) * log(fa) - fa + sum from two halves */
+      t = fa - 0.5;
+      s = s * t;
+      t = s - fa;
+      s = s + sum;
+      t = t + s;
+    } else {
+      i = fa - 3.0; /* 3 <= fa < 8: ratio of two polynomials in i, plus i */
+      s =        -4.02412642744125560E+003;
+      s = s * i - 2.97693796998962000E+005;
+      s = s * i - 6.38367087682528790E+006;
+      s = s * i - 5.57807214576539320E+007;
+      s = s * i - 2.24585140671479230E+008;
+      s = s * i - 4.70690608529125090E+008;
+      s = s * i - 7.62587065363263010E+008;
+      s = s * i - 9.71405112477113250E+008;
+      t =     i - 1.02277248359873170E+003;
+      t = t * i - 1.34815350617954480E+005;
+      t = t * i - 4.64321188814343610E+006;
+      t = t * i - 6.48011106025542540E+007;
+      t = t * i - 4.19763847787431360E+008;
+      t = t * i - 1.25629926018000720E+009;
+      t = t * i - 1.40144133846491690E+009;
+      t = s / t;
+      t = t + i;
+    }
+  } else if (fa >= 1.5) {
+    i = fa - 2.0; /* 1.5 <= fa < 3: polynomial in i with a final factor of i */
+    t =         9.84839283076310610E-009;
+    t = t * i - 6.69743850483466500E-008;
+    t = t * i + 2.16565148880011450E-007;
+    t = t * i - 4.86170275781575260E-007;
+    t = t * i + 9.77962097401114400E-007;
+    t = t * i - 2.03041287574791810E-006;
+    t = t * i + 4.36119725805364580E-006;
+    t = t * i - 9.43829310866446590E-006;
+    t = t * i + 2.05106878496644220E-005;
+    t = t * i - 4.49271383742108440E-005;
+    t = t * i + 9.94570466342226000E-005;
+    t = t * i - 2.23154589559238440E-004;
+    t = t * i + 5.09669559149637430E-004;
+    t = t * i - 1.19275392649162300E-003;
+    t = t * i + 2.89051032936815490E-003;
+    t = t * i - 7.38555102806811700E-003;
+    t = t * i + 2.05808084278121250E-002;
+    t = t * i - 6.73523010532073720E-002;
+    t = t * i + 3.22467033424113040E-001;
+    t = t * i + 4.22784335098467190E-001;
+    t = t * i;
+  } else if (fa >= 0.7) {
+    i = 1.0 - fa; /* 0.7 <= fa < 1.5: polynomial in i with a final factor of i */
+    t =         1.17786911519331130E-002;  
+    t = t * i + 3.89046747413522300E-002;
+    t = t * i + 5.90045711362049900E-002;
+    t = t * i + 6.02143305254344420E-002;
+    t = t * i + 5.61652708964839180E-002;
+    t = t * i + 5.75052755193461370E-002;
+    t = t * i + 6.21061973447320710E-002;
+    t = t * i + 6.67614724532521880E-002;
+    t = t * i + 7.14856037245421020E-002;
+    t = t * i + 7.69311251313347100E-002;
+    t = t * i + 8.33503129714946310E-002;
+    t = t * i + 9.09538288991182800E-002;
+    t = t * i + 1.00099591546322310E-001;
+    t = t * i + 1.11334278141734510E-001;
+    t = t * i + 1.25509666613462880E-001;
+    t = t * i + 1.44049896457704160E-001;
+    t = t * i + 1.69557177031481600E-001;
+    t = t * i + 2.07385551032182120E-001;
+    t = t * i + 2.70580808427600350E-001;
+    t = t * i + 4.00685634386517050E-001;
+    t = t * i + 8.22467033424113540E-001;
+    t = t * i + 5.77215664901532870E-001;
+    t = t * i;
+  } else {
+    t =         -9.04051686831357990E-008; /* fa < 0.7: polynomial in fa, then the negated log of (t * fa + fa) */
+    t = t * fa + 7.06814224969349250E-007;
+    t = t * fa - 3.80702154637902830E-007;
+    t = t * fa - 2.12880892189316100E-005;
+    t = t * fa + 1.29108470307156190E-004;
+    t = t * fa - 2.15932815215386580E-004;
+    t = t * fa - 1.16484324388538480E-003;
+    t = t * fa + 7.21883433044470670E-003;
+    t = t * fa - 9.62194579514229560E-003;
+    t = t * fa - 4.21977386992884450E-002;
+    t = t * fa + 1.66538611813682460E-001;
+    t = t * fa - 4.20026350606819980E-002;
+    t = t * fa - 6.55878071519427450E-001;
+    t = t * fa + 5.77215664901523870E-001;
+    t = t * fa;
+    t = t * fa + fa;
+    t = -log (t);
+  }
+  if (a >= 0.0) return t; /* non-negative argument: no correction needed */
+  if (fa < 1e-19) return -log(fa); /* tiny negative argument: result is -log(|a|) */
+  i = floor(fa);       
+  if (fa == i) return 1.0 / (fa - i); /* a is an integer: return infinity */
+  i = rint (2.0 * fa); /* nearest multiple of 1/2, counted in half units */
+  quot = static_cast<long long int>(i);
+  i = fa - 0.5 * i; /* remainder after removing quot halves */
+  i = i * CUDART_PI;
+  if (quot & 1) { /* odd number of halves: take cos of the remainder, otherwise sin */
+    i = cos(i);
+  } else {
+    i = sin(i);
+  }
+  i = fabs(i); /* only the magnitude is used below */
+  t = log(CUDART_PI / (i * fa)) - t; /* negative a: log(pi / (|a| * i)) minus the value computed for |a| */
+  return t;
+}
+
+__func__(unsigned long long int __internal_host_nan_kernel(const char *s))
+{
+  /* Parse the tag string as an unsigned integer ("0x"/"0X" prefix: hex,
+     leading "0": octal, otherwise decimal) and place its low 52 bits into
+     a bit pattern with 0x7ff8000000000000 set. Overflow saturates the
+     value to all ones; any invalid character resets it to zero. A null
+     tag yields a zero payload. */
+  unsigned long long payload = 0;
+  int digit;
+  int overflowed = 0;
+  int malformed = 0;
+
+  if (s) {
+    if (*s != '0') {
+      /* decimal */
+      for (; *s; s++) {
+        digit = *s;
+        /* would payload * 10 + digit exceed 2^64 - 1? */
+        if ((payload > 1844674407370955161ULL) || 
+            ((payload == 1844674407370955161ULL) && (digit > '5'))) {
+          overflowed = 1;
+        }
+        if ((digit < '0') || (digit > '9')) {
+          malformed = 1;
+        } else {
+          digit = digit - '0';
+          payload = payload * 10 + digit;
+        }
+      }
+    } else {
+      s++;
+      if ((*s == 'x') || (*s == 'X')) {
+        /* hexadecimal */
+        s++;
+        while (*s == '0') {
+          s++;
+        }
+        for (; *s; s++) {
+          if (payload > 0x0fffffffffffffffULL) {
+            overflowed = 1;
+          }
+          /* fold upper-case A-F onto lower case */
+          digit = (((*s) >= 'A') && ((*s) <= 'F')) ? (*s + 'a' - 'A') : (*s);
+          if ((digit >= 'a') && (digit <= 'f')) {
+            digit = digit - 'a' + 10;
+            payload = payload * 16 + digit;
+          } else if ((digit >= '0') && (digit <= '9')) {
+            digit = digit - '0';
+            payload = payload * 16 + digit;
+          } else {
+            malformed = 1;
+          }
+        }
+      } else {
+        /* octal */
+        while (*s == '0') {
+          s++;
+        }
+        for (; *s; s++) {
+          if (payload > 0x1fffffffffffffffULL) {
+            overflowed = 1;
+          }
+          digit = *s;
+          if ((digit >= '0') && (digit <= '7')) {
+            digit = digit - '0';
+            payload = payload * 8 + digit;
+          } else {
+            malformed = 1;
+          }
+        }
+      }
+    }
+  }
+  /* An invalid tag takes precedence over an overflowing one. */
+  if (overflowed) {
+    payload = ~0ULL;
+  }
+  if (malformed) {
+    payload = 0ULL;
+  }
+  return (payload & 0x000fffffffffffffULL) | 0x7ff8000000000000ULL;
+}
+
+__func__(double nan(const char *tagp))
+{
+  /* Build the 64-bit pattern from the tag and reinterpret it as a double. */
+  const unsigned long long bits = __internal_host_nan_kernel(tagp);
+  double result;
+  memcpy(&result, &bits, sizeof(double));
+  return result;
+}
+
+__func__(double __host_tgamma_kernel(double a))
+{
+  /* Degree-15 polynomial in a, evaluated by Horner's rule; used by
+     tgamma() as the divisor/multiplier for reduced arguments. */
+  double acc = - 4.4268934071252475E-010;
+  acc = acc * a - 2.0266591846658954E-007;
+  acc = acc * a + 1.1381211721119527E-006;
+  acc = acc * a - 1.2507734816630748E-006;
+  acc = acc * a - 2.0136501740408771E-005;
+  acc = acc * a + 1.2805012607354486E-004;
+  acc = acc * a - 2.1524140811527418E-004;
+  acc = acc * a - 1.1651675459704604E-003;
+  acc = acc * a + 7.2189432248466381E-003;
+  acc = acc * a - 9.6219715326862632E-003;
+  acc = acc * a - 4.2197734554722394E-002;
+  acc = acc * a + 1.6653861138250356E-001;
+  acc = acc * a - 4.2002635034105444E-002;
+  acc = acc * a - 6.5587807152025712E-001;
+  acc = acc * a + 5.7721566490153287E-001;
+  acc = acc * a + 1.0000000000000000E+000;
+  return acc;
+}
+
+__func__(double __host_stirling_poly(double a))
+{
+  /* Degree-9 polynomial in 1/a with constant term 1, evaluated by
+     Horner's rule; the correction factor used by the Stirling paths. */
+  const double inv = 1.0 / a;
+  double acc = + 8.3949872067208726e-004;
+  acc = acc * inv - 5.1717909082605919e-005;
+  acc = acc * inv - 5.9216643735369393e-004;
+  acc = acc * inv + 6.9728137583658571e-005;
+  acc = acc * inv + 7.8403922172006662e-004;
+  acc = acc * inv - 2.2947209362139917e-004;
+  acc = acc * inv - 2.6813271604938273e-003;
+  acc = acc * inv + 3.4722222222222220e-003;
+  acc = acc * inv + 8.3333333333333329e-002;
+  acc = acc * inv + 1.0000000000000000e+000;
+  return acc;
+}
+
+__func__(double __host_tgamma_stirling(double a))
+{
+  /* Stirling-style evaluation used by tgamma() for large positive a:
+     pow(a, a - 0.5) * exp(-a) * CUDART_SQRT_2PI * correction(a). */
+  const double correction = __host_stirling_poly (a);
+  double power;
+  double result;
+
+  if (a < 142.0) {
+    power = pow (a, a - 0.5);
+    result = power * exp (-a);
+    result = result * CUDART_SQRT_2PI;
+    return result * correction;
+  }
+  if (a < 172.0) {
+    /* Here pow() is taken with half the exponent and its result is
+       multiplied in twice, once at the start and once at the end. */
+    power = pow (a, 0.5 * a - 0.25);
+    result = power * exp (-a);
+    result = result * CUDART_SQRT_2PI;
+    result = result * correction;
+    return result * power;
+  }
+  return exp(1000.0); /* INF */
+}
+
+__func__(double tgamma(double a)) /* Host fallback for the gamma function: recurrence plus kernel polynomial for |a| < 20, Stirling-based paths otherwise. */
+{
+  double s, xx, x = a;
+  if (__isnan(a)) {
+    return a + a; /* propagate NaN */
+  }
+  if (fabs(x) < 20.0) {
+    if (x >= 0.0) {
+      s = 1.0;
+      xx = x;
+      while (xx > 1.5) { /* step xx down by 1, multiplying each reduced value into s */
+        xx = xx - 1.0;
+        s = s * xx;
+      }
+      if (x >= 0.5) {
+        xx = xx - 1.0; /* for x >= 0.5 the kernel is evaluated at xx - 1 */
+      }
+      xx = __host_tgamma_kernel (xx);
+      if (x < 0.5) {
+        xx = xx * x; /* for x < 0.5 the kernel value is additionally scaled by x */
+      }
+      s = s / xx; /* accumulated product divided by the kernel term */
+    } else {
+      xx = x;
+      s = xx;
+      if (x == floor(x)) {
+        return 0.0 / (x - floor(x)); /* negative integer: 0.0 / 0.0 */
+      }
+      while (xx < -0.5) { /* step xx up by 1, multiplying each shifted value into s */
+        xx = xx + 1.0;
+        s = s * xx;
+      }
+      xx = __host_tgamma_kernel (xx);
+      s = s * xx;
+      s = 1.0 / s; /* result is the reciprocal of product * kernel */
+    }
+    return s;
+  } else {
+    if (x >= 0.0) {
+      return __host_tgamma_stirling (x);
+    } else {
+      double t;
+      int quot;
+      if (x == floor(x)) {
+        return 0.0 / (x - floor(x)); /* negative integer: 0.0 / 0.0 */
+      }
+      if (x < -185.0) {
+        int negative;
+        x = floor(x);
+        negative = ((x - (2.0 * floor(0.5 * x))) == 1.0); /* true when floor(x) is odd */
+        return negative ? (-1.0 / 1e308 / 1e308) : CUDART_ZERO; /* the quotient underflows to -0.0; CUDART_ZERO is presumably +0.0 - defined outside this view */
+      }
+      /* compute sin(pi*x) accurately */
+      xx = rint (2.0 * x); /* nearest multiple of 1/2, counted in half units */
+      quot = static_cast<int>(xx);
+      xx = -0.5 * xx + x; /* remainder after removing quot halves */
+      xx = xx * CUDART_PI;
+      if (quot & 1) {
+        xx = cos (xx);
+      } else {
+        xx = sin (xx);
+      }
+      if (quot & 2) {
+        xx = -xx;
+      }
+      x = fabs (x);
+      s = exp (-x);
+      t = x - 0.5;
+      if (x > 140.0) t = 0.5 * t; /* above 140 only half the exponent goes through pow() */
+      t = pow (x, t);
+      if (x > 140.0) s = s * t; /* ...one half is multiplied in here, the other is applied by the final division */
+      s = s * __host_stirling_poly (x);
+      s = s * x;
+      s = s * xx;
+      s = 1.0 / s;
+      s = s * CUDART_SQRT_PIO2;
+      s = s / t;
+      return s;
+    }
+  }
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+/*******************************************************************************
+*                                                                              *
+* HOST IMPLEMENTATION FOR FLOAT AND LONG DOUBLE ROUTINES FOR WINDOWS PLATFORM  *
+* MAP FLOAT AND LONG DOUBLE ROUTINES TO DOUBLE ROUTINES                        *
+*                                                                              *
+*******************************************************************************/
+
+__func__(int __signbitl(const long double a))
+{
+  /* Convert to double and reuse the double-precision sign test. */
+  const double as_double = static_cast<double>(a);
+  return __signbit(as_double);
+}
+
+__func__(int __signbitf(const float a))
+{
+  /* Widen to double and reuse the double-precision sign test. */
+  const double as_double = static_cast<double>(a);
+  return __signbit(as_double);
+}
+
+__func__(int __finitel(const long double a))
+{
+  /* Convert to double and reuse the double-precision finiteness test. */
+  const double as_double = static_cast<double>(a);
+  return __finite(as_double);
+}
+
+__func__(int __finitef(const float a))
+{
+  /* Widen to double and reuse the double-precision finiteness test. */
+  const double as_double = static_cast<double>(a);
+  return __finite(as_double);
+}
+
+__func__(int __isinfl(const long double a))
+{
+  /* Convert to double and reuse the double-precision infinity test. */
+  const double as_double = static_cast<double>(a);
+  return __isinf(as_double);
+}
+
+__func__(int __isinff(const float a))
+{
+  /* Widen to double and reuse the double-precision infinity test. */
+  const double as_double = static_cast<double>(a);
+  return __isinf(as_double);
+}
+
+__func__(int __isnanl(const long double a))
+{
+  /* Convert to double and reuse the double-precision NaN test. */
+  const double as_double = static_cast<double>(a);
+  return __isnan(as_double);
+}
+
+__func__(int __isnanf(const float a))
+{
+  /* Widen to double and reuse the double-precision NaN test. */
+  const double as_double = static_cast<double>(a);
+  return __isnan(as_double);
+}
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+__func__(float fmaxf(const float a, const float b))
+{
+  /* Widen both operands, take the double-precision maximum, narrow back. */
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  const double wide_result = fmax(wide_a, wide_b);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float fminf(const float a, const float b))
+{
+  /* Widen both operands, take the double-precision minimum, narrow back. */
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  const double wide_result = fmin(wide_a, wide_b);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float roundf(const float a))
+{
+  /* Single precision forwards to the double-precision round(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = round(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(long int lroundf(const float a))
+{
+  /* Single precision forwards to the double-precision lround(). */
+  const double wide = static_cast<double>(a);
+  return lround(wide);
+}
+
+__func__(long long int llroundf(const float a))
+{
+  /* Single precision forwards to the double-precision llround(). */
+  const double wide = static_cast<double>(a);
+  return llround(wide);
+}
+
+__func__(float truncf(const float a))
+{
+  /* Single precision forwards to the double-precision trunc(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = trunc(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float rintf(const float a))
+{
+  /* Single precision forwards to the double-precision rint(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = rint(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float nearbyintf(const float a))
+{
+  /* Single precision forwards to the double-precision nearbyint(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = nearbyint(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(long int lrintf(const float a))
+{
+  /* Single precision forwards to the double-precision lrint(). */
+  const double wide = static_cast<double>(a);
+  return lrint(wide);
+}
+
+__func__(long long int llrintf(const float a))
+{
+  /* Single precision forwards to the double-precision llrint(). */
+  const double wide = static_cast<double>(a);
+  return llrint(wide);
+}
+
+__func__(float logbf(const float a))
+{
+  /* Single precision forwards to the double-precision logb(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = logb(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float scalblnf(const float a, const long int b))
+{
+  /* Widen the value, scale with the double-precision scalbln(), narrow back. */
+  const double wide = static_cast<double>(a);
+  const double wide_result = scalbln(wide, b);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float log2f(const float a))
+{
+  /* Single precision forwards to the double-precision log2(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = log2(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float exp2f(const float a))
+{
+  /* Single precision forwards to the double-precision exp2(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = exp2(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float acoshf(const float a))
+{
+  /* Single precision forwards to the double-precision acosh(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = acosh(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float asinhf(const float a))
+{
+  /* Single precision forwards to the double-precision asinh(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = asinh(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float atanhf(const float a))
+{
+  /* Single precision forwards to the double-precision atanh(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = atanh(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float cbrtf(const float a))
+{
+  /* Single precision forwards to the double-precision cbrt(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = cbrt(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float expm1f(const float a))
+{
+  /* Single precision forwards to the double-precision expm1(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = expm1(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float fdimf(const float a, const float b))
+{
+  /* Widen both operands, apply the double-precision fdim(), narrow back. */
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  const double wide_result = fdim(wide_a, wide_b);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float log1pf(const float a))
+{
+  /* Single precision forwards to the double-precision log1p(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = log1p(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float scalbnf(const float a, const int b))
+{
+  /* Widen the value, scale with the double-precision scalbn(), narrow back. */
+  const double wide = static_cast<double>(a);
+  const double wide_result = scalbn(wide, b);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float fmaf(const float a, const float b, const float c))
+{
+  /* Widen all three operands, apply the double-precision fma(), and
+     narrow the result back to float. */
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  const double wide_c = static_cast<double>(c);
+  const double wide_result = fma(wide_a, wide_b, wide_c);
+  return static_cast<float>(wide_result);
+}
+
+__func__(int ilogbf(const float a))
+{
+  /* Single precision forwards to the double-precision ilogb(). */
+  const double wide = static_cast<double>(a);
+  return ilogb(wide);
+}
+
+__func__(float erff(const float a))
+{
+  /* Single precision forwards to the double-precision erf(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = erf(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float erfcf(const float a))
+{
+  /* Single precision forwards to the double-precision erfc(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = erfc(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float lgammaf(const float a))
+{
+  /* Single precision forwards to the double-precision lgamma(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = lgamma(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float tgammaf(const float a))
+{
+  /* Single precision forwards to the double-precision tgamma(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = tgamma(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float remquof(const float a, const float b, int *quo))
+{
+  /* Widen both operands and forward to the double-precision remquo();
+     the quotient pointer is passed through unchanged. */
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  const double wide_result = remquo(wide_a, wide_b, quo);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float remainderf(const float a, const float b))
+{
+  /* Widen both operands, apply the double-precision remainder(), narrow back. */
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  const double wide_result = remainder(wide_a, wide_b);
+  return static_cast<float>(wide_result);
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (defined _MSC_VER) && (_MSC_VER >= 1700)
+__func__(float j0f(const float a))
+{
+  /* Single precision forwards to the double-precision _j0(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = _j0(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float j1f(const float a))
+{
+  /* Single precision forwards to the double-precision _j1(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = _j1(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float jnf(const int n, const float a))
+{
+  /* Single precision forwards to the double-precision _jn(); the order n
+     is passed through unchanged. */
+  const double wide = static_cast<double>(a);
+  const double wide_result = _jn(n, wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float y0f(const float a))
+{
+  /* Single precision forwards to the double-precision _y0(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = _y0(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float y1f(const float a))
+{
+  /* Single precision forwards to the double-precision _y1(). */
+  const double wide = static_cast<double>(a);
+  const double wide_result = _y1(wide);
+  return static_cast<float>(wide_result);
+}
+
+__func__(float ynf(const int n, const float a))
+{
+  /* Single precision forwards to the double-precision _yn(); the order n
+     is passed through unchanged. */
+  const double wide = static_cast<double>(a);
+  const double wide_result = _yn(n, wide);
+  return static_cast<float>(wide_result);
+}
+#endif /* (defined _MSC_VER) && (_MSC_VER >= 1700) */
+
+
+/*******************************************************************************
+*                                                                              *
+* HOST IMPLEMENTATION FOR FLOAT ROUTINES FOR WINDOWS PLATFORM                  *
+*                                                                              *
+*******************************************************************************/
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+__func__(float copysignf(float a, const float b))
+{
+  /* Combine the magnitude bits of a with the sign bit of b. */
+  const unsigned int sign_mask = 0x80000000U;
+  unsigned int mag_bits;
+  unsigned int sign_bits;
+
+  memcpy(&mag_bits, &a, sizeof(float));
+  memcpy(&sign_bits, &b, sizeof(float));
+  mag_bits &= ~sign_mask;
+  mag_bits |= (sign_bits & sign_mask);
+  memcpy(&a, &mag_bits, sizeof(float));
+  return a;
+}
+
+__func__(float nextafterf(float a, const float b))
+{
+  /* Move a by one representable float in the direction of b by stepping
+     its raw bit pattern. */
+  unsigned int bits_a;
+  unsigned int bits_b;
+
+  /* Any NaN operand propagates through the sum. */
+  if (__isnanf(a) || __isnanf(b)) {
+    return a + b;
+  }
+  memcpy(&bits_a, &a, sizeof(float));
+  memcpy(&bits_b, &b, sizeof(float));
+  /* With the sign bits shifted out, zero means both operands are zeros. */
+  if (((bits_a | bits_b) << 1U) == 0U) {
+    return b;
+  }
+  /* Leaving zero: smallest subnormal carrying the sign of b. */
+  if (a == 0.0F) {
+    return copysignf(1.401298464e-045F, b);
+  }
+  /* a is nonzero and ordered here, so "not negative" means positive.
+     Growing in magnitude increments the pattern, shrinking decrements it. */
+  if (a < b) {
+    if (a < 0.0F) {
+      bits_a--;
+    } else {
+      bits_a++;
+    }
+  } else if (a > b) {
+    if (a < 0.0F) {
+      bits_a++;
+    } else {
+      bits_a--;
+    }
+  }
+  memcpy(&a, &bits_a, sizeof(float));
+  return a;
+}
+
+__func__(float nanf(const char *tagp))
+{
+  /* Take the low 32 bits of the parsed tag pattern, keep its low 23 bits,
+     set 0x7fc00000, and reinterpret the result as a float. */
+  unsigned int bits = static_cast<unsigned int>(__internal_host_nan_kernel(tagp));
+  float result;
+
+  bits &= 0x007fffffU;
+  bits |= 0x7fc00000U;
+  memcpy(&result, &bits, sizeof(float));
+  return result;
+}
+
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#endif /* _WIN32 */
+
+/*******************************************************************************
+*                                                                              *
+* HOST IMPLEMENTATION FOR DOUBLE AND FLOAT ROUTINES. ALL PLATFORMS             *
+*                                                                              *
+*******************************************************************************/
+
+__func__(double rsqrt(const double a))
+{
+  /* Reciprocal square root: one sqrt() followed by one division. */
+  const double root = sqrt(a);
+  return 1.0 / root;
+}
+
+__func__(double rcbrt(const double a))
+{
+  /* Reciprocal cube root: an exp2/log2 estimate on |a|, one refinement
+     step, then the sign of a is applied. */
+  double mag;
+  double est;
+
+  if (__isnan(a)) {
+    return a + a;
+  }
+  /* Zeros and infinities are handled by a plain reciprocal. */
+  if (a == 0.0 || __isinf(a)) {
+    return 1.0 / a;
+  }
+  mag = fabs(a);
+  est = exp2(-CUDART_THIRD * log2(mag));                      /* initial approximation */
+  est = ((est*est) * (-mag*est) + 1.0) * (CUDART_THIRD*est) + est;/* refine approximation */
+#if defined(__APPLE__)
+  const bool negative = (__signbitd(a) != 0);
+#else /* __APPLE__ */
+  const bool negative = (__signbit(a) != 0);
+#endif /* __APPLE__ */
+  return negative ? -est : est;
+}
+
+__func__(double sinpi(double a))
+{
+  /* sin(pi * a): a is reduced by the nearest multiple of 1/2 and sin or
+     cos of the scaled remainder is selected and signed from the low two
+     bits of that multiple. */
+  if (__isnan(a)) {
+    return a + a;
+  }
+  /* Zeros and infinities defer to sin(). */
+  if (a == 0.0 || __isinf(a)) {
+    return sin (a);
+  }
+  /* Integral a: the triple division underflows to a zero with a's sign. */
+  if (a == floor(a)) {
+    return ((a / 1.0e308) / 1.0e308) / 1.0e308;
+  }
+
+  const double half_turns = round(a + a);
+  const int quadrant = (int)((long long int)half_turns);
+  double value;
+
+  a -= half_turns * 0.5;
+  a = a * CUDART_PI;
+  value = (quadrant & 1) ? cos (a) : sin (a);
+  return (quadrant & 2) ? -value : value;
+}
+
+__func__(double cospi(double a))
+{
+  /* cos(pi * a): a is reduced by the nearest multiple of 1/2, and sin or
+   * cos of the scaled remainder is selected and signed from the low two
+   * bits of (multiple + 1).
+   *
+   *   NaN       -> NaN (a + a)
+   *   +/-Inf    -> cos(a)
+   *   a zero result is always returned as +0.0
+   */
+  if (__isnan(a)) {
+    return a + a;
+  }
+  if (__isinf(a)) {
+    return cos (a);
+  } 
+  /* Beyond 2^53 every double is an even integer, so the argument can be
+     replaced by 0 without changing the result. */
+  if (fabs(a) > 9.0071992547409920e+015) {
+    a = 0.0;
+  }
+  double twoa = a + a;
+  double rtwoa = round(twoa);
+  long long int l = (long long int)rtwoa;
+  /* Only the low two bits of (l + 1) are used. The sum is formed in
+     unsigned arithmetic: the previous signed "n++" overflowed when the
+     low 32 bits of l were 0x7fffffff (e.g. a = 1073741823.5), which is
+     undefined behaviour for int. Unsigned conversion and addition wrap
+     modulo 2^32 and leave the low bits unchanged. */
+  unsigned int n = static_cast<unsigned int>(l) + 1U;
+  a -= rtwoa * 0.5;
+  a = a * CUDART_PI;
+  if (n & 1U) {
+    a = cos (a);
+  } else {
+    a = sin (a);
+  }
+  if (n & 2U) {
+    a = -a;
+  }
+  /* Normalise -0.0 to +0.0. */
+  if (a == 0.0) {
+    a = fabs(a);
+  }
+  return a;
+}
+
+__func__(void sincospi(const double a, double *sptr, double *cptr))
+{
+  /* Store sinpi(a) through sptr, then cospi(a) through cptr. */
+  const double sine = sinpi(a);
+  *sptr = sine;
+  const double cosine = cospi(a);
+  *cptr = cosine;
+}
+
+__func__(double erfinv(const double a)) /* Inverse error function: three rational approximations selected by fa = |a|, plus special handling for |a| >= 1. */
+{
+  double p, q, t, fa; /* p / q: numerator / denominator of the active rational approximation; t: its argument, then the result */
+  unsigned long long int l;
+
+  fa = fabs(a);
+  if (fa >= 1.0) { /* at or outside the interval ends */
+    l = 0xfff8000000000000ULL;
+    memcpy(&t, &l, sizeof(double)); /* INDEFINITE */
+    if (fa == 1.0) {
+      t = a * exp(1000.0);          /* Infinity */
+    }
+  } else if (fa >= 0.9375) {
+    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
+       Approximations for the Inverse of the Error Function. Mathematics of
+       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59
+     */
+    t = log1p(-fa); /* log(1 - |a|) */
+    t = 1.0 / sqrt(-t); /* the approximation is evaluated in 1 / sqrt(-log(1 - |a|)) */
+    p =         2.7834010353747001060e-3;
+    p = p * t + 8.6030097526280260580e-1;
+    p = p * t + 2.1371214997265515515e+0;
+    p = p * t + 3.1598519601132090206e+0;
+    p = p * t + 3.5780402569085996758e+0;
+    p = p * t + 1.5335297523989890804e+0;
+    p = p * t + 3.4839207139657522572e-1;
+    p = p * t + 5.3644861147153648366e-2;
+    p = p * t + 4.3836709877126095665e-3;
+    p = p * t + 1.3858518113496718808e-4;
+    p = p * t + 1.1738352509991666680e-6;
+    q =     t + 2.2859981272422905412e+0;
+    q = q * t + 4.3859045256449554654e+0;
+    q = q * t + 4.6632960348736635331e+0;
+    q = q * t + 3.9846608184671757296e+0;
+    q = q * t + 1.6068377709719017609e+0;
+    q = q * t + 3.5609087305900265560e-1;
+    q = q * t + 5.3963550303200816744e-2;
+    q = q * t + 4.3873424022706935023e-3;
+    q = q * t + 1.3858762165532246059e-4;
+    q = q * t + 1.1738313872397777529e-6;
+    t = p / (q * t);
+    if (a < 0.0) t = -t; /* this branch worked on |a|; restore the sign */
+  } else if (fa >= 0.75) {
+    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
+       Approximations for the Inverse of the Error Function. Mathematics of
+       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 39
+    */
+    t = a * a - .87890625; /* argument is a^2 shifted by 0.9375^2 */
+    p =         .21489185007307062000e+0;
+    p = p * t - .64200071507209448655e+1;
+    p = p * t + .29631331505876308123e+2;
+    p = p * t - .47644367129787181803e+2;
+    p = p * t + .34810057749357500873e+2;
+    p = p * t - .12954198980646771502e+2;
+    p = p * t + .25349389220714893917e+1;
+    p = p * t - .24758242362823355486e+0;
+    p = p * t + .94897362808681080020e-2;
+    q =     t - .12831383833953226499e+2;
+    q = q * t + .41409991778428888716e+2;
+    q = q * t - .53715373448862143349e+2;
+    q = q * t + .33880176779595142685e+2;
+    q = q * t - .11315360624238054876e+2;
+    q = q * t + .20369295047216351160e+1;
+    q = q * t - .18611650627372178511e+0;
+    q = q * t + .67544512778850945940e-2;
+    p = p / q;
+    t = a * p; /* result is a times the rational value, so the sign follows a */
+  } else {
+    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
+       Approximations for the Inverse of the Error Function. Mathematics of
+       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 18
+    */
+    t = a * a - .5625; /* argument is a^2 shifted by 0.75^2; a NaN argument also takes this branch and propagates */
+    p =       - .23886240104308755900e+2;
+    p = p * t + .45560204272689128170e+3;
+    p = p * t - .22977467176607144887e+4;
+    p = p * t + .46631433533434331287e+4;
+    p = p * t - .43799652308386926161e+4;
+    p = p * t + .19007153590528134753e+4;
+    p = p * t - .30786872642313695280e+3;
+    q =     t - .83288327901936570000e+2;
+    q = q * t + .92741319160935318800e+3;
+    q = q * t - .35088976383877264098e+4;
+    q = q * t + .59039348134843665626e+4;
+    q = q * t - .48481635430048872102e+4;
+    q = q * t + .18997769186453057810e+4;
+    q = q * t - .28386514725366621129e+3;
+    p = p / q;
+    t = a * p; /* result is a times the rational value, so the sign follows a */
+  }
+  return t;
+}
+
+__func__(double erfcinv(const double a))
+{ /* Inverse complementary error function, double precision.
+     The value of a selects one of five paths:
+       NaN          -> a + a (propagates the NaN)
+       a <= 0       -> the NaN bit pattern 0xfff8000000000000, except that
+                       a == 0 produces Infinity
+       a >= 0.0625  -> erfinv(1 - a); no separate upper bound is checked here
+       a >= 1e-100  -> rational approximation p(t) / (q(t) * t),
+                       with t = 1 / sqrt(-log(a))
+       otherwise    -> the same form with a second coefficient set
+  */
+  double t;
+  unsigned long long int l;
+
+  if (__isnan(a)) {
+    return a + a;
+  }
+  if (a <= 0.0) {
+    l = 0xfff8000000000000ULL;
+    memcpy(&t, &l, sizeof(double));   /* INDEFINITE */
+    if (a == 0.0) {
+        t = (1.0 - a) * exp(1000.0);  /* Infinity: exp(1000.0) overflows double */
+    }
+  } 
+  else if (a >= 0.0625) {
+    t = erfinv (1.0 - a);
+  }
+  else if (a >= 1e-100) {
+    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
+       Approximations for the Inverse of the Error Function. Mathematics of
+       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59
+    */
+    double p, q;
+    t = log(a);
+    t = 1.0 / sqrt(-t);                 /* t = 1 / sqrt(-log(a)) */
+    p =         2.7834010353747001060e-3; /* numerator p(t), Horner form */
+    p = p * t + 8.6030097526280260580e-1;
+    p = p * t + 2.1371214997265515515e+0;
+    p = p * t + 3.1598519601132090206e+0;
+    p = p * t + 3.5780402569085996758e+0;
+    p = p * t + 1.5335297523989890804e+0;
+    p = p * t + 3.4839207139657522572e-1;
+    p = p * t + 5.3644861147153648366e-2;
+    p = p * t + 4.3836709877126095665e-3;
+    p = p * t + 1.3858518113496718808e-4;
+    p = p * t + 1.1738352509991666680e-6;
+    q =     t + 2.2859981272422905412e+0; /* denominator q(t), leading coefficient 1 */
+    q = q * t + 4.3859045256449554654e+0;
+    q = q * t + 4.6632960348736635331e+0;
+    q = q * t + 3.9846608184671757296e+0;
+    q = q * t + 1.6068377709719017609e+0;
+    q = q * t + 3.5609087305900265560e-1;
+    q = q * t + 5.3963550303200816744e-2;
+    q = q * t + 4.3873424022706935023e-3;
+    q = q * t + 1.3858762165532246059e-4;
+    q = q * t + 1.1738313872397777529e-6;
+    t = p / (q * t);                    /* result = p(t) / (q(t) * t) */
+  }
+  else {
+    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
+       Approximations for the Inverse of the Error Function. Mathematics of
+       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 82
+    */
+    double p, q;
+    t = log(a);
+    t = 1.0 / sqrt(-t);                 /* t = 1 / sqrt(-log(a)) */
+    p =         6.9952990607058154858e-1; /* numerator p(t), Horner form */
+    p = p * t + 1.9507620287580568829e+0;
+    p = p * t + 8.2810030904462690216e-1;
+    p = p * t + 1.1279046353630280005e-1;
+    p = p * t + 6.0537914739162189689e-3;
+    p = p * t + 1.3714329569665128933e-4;
+    p = p * t + 1.2964481560643197452e-6;
+    p = p * t + 4.6156006321345332510e-9;
+    p = p * t + 4.5344689563209398450e-12;
+    q =     t + 1.5771922386662040546e+0; /* denominator q(t), leading coefficient 1 */
+    q = q * t + 2.1238242087454993542e+0;
+    q = q * t + 8.4001814918178042919e-1;
+    q = q * t + 1.1311889334355782065e-1;
+    q = q * t + 6.0574830550097140404e-3;
+    q = q * t + 1.3715891988350205065e-4;
+    q = q * t + 1.2964671850944981713e-6;
+    q = q * t + 4.6156017600933592558e-9;
+    q = q * t + 4.5344687377088206783e-12;
+    t = p / (q * t);                    /* result = p(t) / (q(t) * t) */
+  }
+  return t;
+}
+
+__func__(double normcdfinv(const double a))
+{
+  /* Evaluate erfcinv at 2*a, then scale by the constant -1.4142135623730951. */
+  const double inv = erfcinv(a + a);
+  return -1.4142135623730951 * inv;
+}
+
+__func__(double normcdf(double a))
+{ /* Returns 0.5 * erfc(ah), where ah is a * -CUDART_SQRT_HALF_HI plus a
+     low-order correction term, with an extra refinement for a < -1.0.
+     NOTE(review): v1 + v2 looks like a two-part split of the same constant
+     as -CUDART_SQRT_HALF_HI / -CUDART_SQRT_HALF_LO (about -0.7071);
+     confirm against the macro definitions.
+  */
+  double ah, al, t1, t2, u1, u2, v1, v2, z;
+  if (fabs (a) > 38.5) a = copysign (38.5, a); /* clamp |a| to 38.5, keeping the sign */
+  ah = a * 134217729.0;                        /* 134217729 = 2^27 + 1 */
+  u1 = (a - ah) + ah;                          /* u1: high part of a */
+  u2 = a - u1;                                 /* u2: remainder, so u1 + u2 reproduces a */
+  v1 = -7.0710678398609161e-01;
+  v2 =  2.7995440410322203e-09;
+  t1 = a * -CUDART_SQRT_HALF_HI;               /* leading product term */
+  t2 = (((u1 * v1 - t1) + u1 * v2) + u2 * v1) + u2 * v2; /* split product minus t1 (presumably the rounding error of t1) */
+  t2 = (a * -CUDART_SQRT_HALF_LO) + t2;        /* add the contribution of the LO constant */
+  ah = t1 + t2;                                /* erfc argument */
+  z = erfc (ah);
+  if (a < -1.0) {
+    al = (t1 - ah) + t2;                       /* part of t1 + t2 lost when rounding to ah */
+    t1 = -2.0 * ah * z;
+    z = t1 * al + z;                           /* z += (-2 * ah * z) * al */
+  }
+  return 0.5 * z;
+}
+
+__func__(double erfcx(const double a))
+{ /* Scaled complementary error function exp(a*a) * erfc(a), double precision.
+     NaN inputs return a + a. The value for |a| is computed first (polynomial
+     for |a| < 32.0, asymptotic series otherwise); negative a is then handled
+     through erfcx(a) = 2*exp(a*a) - erfcx(|a|).
+  */
+  double x, t1, t2, t3;
+
+  if (__isnan(a)) {
+    return a + a;
+  }
+  x = fabs(a); 
+  if (x < 32.0) {
+    /*  
+     * This implementation of erfcx() is based on the algorithm in: M. M. 
+     * Shepherd and J. G. Laframboise, "Chebyshev Approximation of (1 + 2x)
+     * exp(x^2)erfc x in 0 <= x < INF", Mathematics of Computation, Vol. 
+     * 36, No. 153, January 1981, pp. 249-253. For the core approximation,
+     * the input domain [0,INF] is transformed via (x-k) / (x+k) where k is
+     * a precision-dependent constant. Here, we choose k = 4.0, so the input 
+     * domain [0, 27.3] is transformed into the core approximation domain 
+     * [-1, 0.744409].   
+     *
+     * NOTE(review): this branch is taken for x < 32.0, which is wider than
+     * the [0, 27.3] quoted above; at x = 32 the transformed argument is
+     * 28/36 ~= 0.778. Confirm the approximation is intended up to that bound.
+     */
+    /*
+    // Compute (1+2*x)*exp(x*x)*erfc(x)
+    */
+    /* t2 = (x-4.0)/(x+4.0), transforming [0,INF] to [-1,+1] */ 
+    t1 = x - 4.0; 
+    t2 = x + 4.0; 
+    t2 = t1 / t2;
+    /* approximate on [-1, 0.744409] */   
+    t1 =         - 3.5602694826817400E-010; /* polynomial in t2, Horner form */
+    t1 = t1 * t2 - 9.7239122591447274E-009; 
+    t1 = t1 * t2 - 8.9350224851649119E-009; 
+    t1 = t1 * t2 + 1.0404430921625484E-007; 
+    t1 = t1 * t2 + 5.8806698585341259E-008; 
+    t1 = t1 * t2 - 8.2147414929116908E-007; 
+    t1 = t1 * t2 + 3.0956409853306241E-007; 
+    t1 = t1 * t2 + 5.7087871844325649E-006; 
+    t1 = t1 * t2 - 1.1231787437600085E-005; 
+    t1 = t1 * t2 - 2.4399558857200190E-005; 
+    t1 = t1 * t2 + 1.5062557169571788E-004; 
+    t1 = t1 * t2 - 1.9925637684786154E-004; 
+    t1 = t1 * t2 - 7.5777429182785833E-004; 
+    t1 = t1 * t2 + 5.0319698792599572E-003; 
+    t1 = t1 * t2 - 1.6197733895953217E-002; 
+    t1 = t1 * t2 + 3.7167515553018733E-002; 
+    t1 = t1 * t2 - 6.6330365827532434E-002; 
+    t1 = t1 * t2 + 9.3732834997115544E-002; 
+    t1 = t1 * t2 - 1.0103906603555676E-001; 
+    t1 = t1 * t2 + 6.8097054254735140E-002; 
+    t1 = t1 * t2 + 1.5379652102605428E-002; 
+    t1 = t1 * t2 - 1.3962111684056291E-001; 
+    t1 = t1 * t2 + 1.2329951186255526E+000; 
+    /*
+    // Note: (1+2*x)*exp(x*x)*erfc(x) / (1+2*x) = exp(x*x)*erfc(x)
+    */
+    t2 = 2.0 * x + 1.0; 
+    t1 = t1 / t2;
+  } else {
+    /* asymptotic expansion for large arguments */
+    t2 = 1.0 / x;
+    t3 = t2 * t2;                          /* t3 = 1 / x^2, the series variable */
+    t1 =         -29.53125;
+    t1 = t1 * t3 + 6.5625;
+    t1 = t1 * t3 - 1.875;
+    t1 = t1 * t3 + 0.75;
+    t1 = t1 * t3 - 0.5;
+    t1 = t1 * t3 + 1.0;
+    t2 = t2 * 5.6418958354775628e-001;     /* constant is numerically 1/sqrt(pi) */
+    t1 = t1 * t2;
+  }
+  if (a < 0.0) {
+    /*
+    // Note: erfcx(x) = 2*exp(x^2) - erfcx(|x|)
+    */
+    t2 = (static_cast<int>(x * 16.0)) * 0.0625; /* x truncated to a multiple of 1/16 */
+    t3 = (x - t2) * (x + t2);                   /* x^2 - t2^2 */
+    t3 = exp(t2 * t2) * exp(t3);                /* exp(x^2) as a product of two exponentials */
+    t3 = t3 + t3;                               /* 2 * exp(x^2) */
+    t1 = t3 - t1;
+  }
+  return t1;
+}
+
+__func__(float rsqrtf(const float a))
+{
+  /* Widen to double, call rsqrt, and narrow the result back to float. */
+  const double wide = static_cast<double>(a);
+  const double result = rsqrt(wide);
+  return static_cast<float>(result);
+}
+
+__func__(float rcbrtf(const float a))
+{
+  /* Widen to double, call rcbrt, and narrow the result back to float. */
+  const double wide = static_cast<double>(a);
+  const double result = rcbrt(wide);
+  return static_cast<float>(result);
+}
+
+__func__(float sinpif(const float a))
+{
+  /* Single-precision front end: the work is done by the double sinpi. */
+  const double arg = static_cast<double>(a);
+  return static_cast<float>(sinpi(arg));
+}
+
+__func__(float cospif(const float a))
+{
+  /* Single-precision front end: the work is done by the double cospi. */
+  const double arg = static_cast<double>(a);
+  return static_cast<float>(cospi(arg));
+}
+
+__func__(void sincospif(const float a, float *sptr, float *cptr))
+{
+  /* Compute both results in double via sincospi, then store them as float:
+     the first result through sptr, the second through cptr. */
+  double sin_wide;
+  double cos_wide;
+  const double arg = static_cast<double>(a);
+
+  sincospi(arg, &sin_wide, &cos_wide);
+  *sptr = static_cast<float>(sin_wide);
+  *cptr = static_cast<float>(cos_wide);
+}
+
+__func__(float erfinvf(const float a))
+{
+  /* Forward to the double-precision erfinv and narrow the result. */
+  const double wide = static_cast<double>(a);
+  const double result = erfinv(wide);
+  return static_cast<float>(result);
+}
+
+__func__(float erfcinvf(const float a))
+{
+  /* Forward to the double-precision erfcinv and narrow the result. */
+  const double wide = static_cast<double>(a);
+  const double result = erfcinv(wide);
+  return static_cast<float>(result);
+}
+
+__func__(float normcdfinvf(const float a))
+{
+  /* Forward to the double-precision normcdfinv and narrow the result. */
+  const double arg = static_cast<double>(a);
+  return static_cast<float>(normcdfinv(arg));
+}
+
+__func__(float normcdff(const float a))
+{
+  /* Forward to the double-precision normcdf and narrow the result. */
+  const double arg = static_cast<double>(a);
+  return static_cast<float>(normcdf(arg));
+}
+
+__func__(float erfcxf(const float a))
+{
+  /* Forward to the double-precision erfcx and narrow the result. */
+  const double arg = static_cast<double>(a);
+  return static_cast<float>(erfcx(arg));
+}
+
+#if defined(_WIN32)
+#pragma warning (pop)
+#endif /* _WIN32 */
+
+#endif /* !__CUDACC__ */
+
+#endif /* !__MATH_FUNCTIONS_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_HPP__
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/mma.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/mma.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3203a8223bb1571deeefd0f8985ae079807aa08
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/mma.h
@@ -0,0 +1,754 @@
+/*
+ * Copyright 2017-2020 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/mma.h is an internal header file and must not be used directly.  Please use mma.h instead.")
+#else
+#warning "crt/mma.h is an internal header file and must not be used directly.  Please use mma.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__
+#endif
+
+#if !defined(__CUDA_MMA_H__)
+#define __CUDA_MMA_H__
+
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+#define __CUDA_MMA_DEVICE_DECL__ static __device__ __inline__
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720
+#define __CUDA_IMMA__ 1
+#endif  /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720 */
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730
+#define __CUDA_SUBBYTE_IMMA__ 1
+#endif  /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730 */
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
+#define __CUDA_AMPERE_MMA__ 1
+#endif  /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 */
+
+namespace nvcuda {
+namespace wmma {
+  
+  // utility functions
+#ifdef __CUDA_AMPERE_MMA__
+  inline __device__ float __float_to_tf32(float in) 
+  { /* Converts in with the PTX instruction cvt.rna.tf32.f32 into a scratch
+       .b32 register, then moves that register into the float result.
+       See the PTX ISA description of cvt for the .rna rounding modifier. */
+    float ret; 
+    asm("{\n  .reg .b32 __$1;"
+        "\n   cvt.rna.tf32.f32 __$1, %1;"
+        "\n   mov.b32 %0, __$1;\n}\n" : "=f"(ret) : "f"(in) ); 
+    return ret; 
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */  
+  
+  // 
+  // tags (forward-declared here; used as template arguments of fragment<>)
+  // 
+  struct row_major;   // Layout argument of fragment<>
+  struct col_major;   // Layout argument of fragment<>
+  struct matrix_a;    // Use argument of fragment<>
+  struct matrix_b;    // Use argument of fragment<>
+  struct accumulator; // Use argument of fragment<>; specialized without a Layout argument
+
+#ifdef __CUDA_AMPERE_MMA__
+  namespace precision {
+    struct tf32; // element-type tag; stored as float (see helper_traits<precision::tf32>)
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */  
+#ifdef __CUDA_SUBBYTE_IMMA__
+  namespace experimental {
+    namespace precision {
+      struct u4; // 4-bit unsigned
+      struct s4; // 4-bit signed
+      struct b1; // 1-bit
+    }
+    enum bmmaBitOp { bmmaBitOpXOR = 1 // bmmaBitOpAND is only declared when __CUDA_AMPERE_MMA__ is defined
+#ifdef __CUDA_AMPERE_MMA__
+                    , bmmaBitOpAND = 2
+#endif  /* __CUDA_AMPERE_MMA__ */
+    };
+    enum bmmaAccumulateOp { bmmaAccumulateOpPOPC = 1 }; // single enumerator
+  }
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+  // 
+  // layout
+  // Passed as the run-time layout argument of the accumulator load_matrix_sync overloads.
+  enum layout_t {
+    mem_row_major, mem_col_major // no explicit values: mem_row_major == 0, mem_col_major == 1
+  };
+  
+  template <typename T> // primary template: all three associated types are T itself
+  struct helper_traits {
+    typedef T element_type;         // fragment element type
+    typedef T storage_element_type; // type of the entries of __frag_base::x
+    typedef T fill_argument_type;   // type of the value accepted by fill_fragment
+  };
+
+#ifdef __CUDA_SUBBYTE_IMMA__
+  template<> struct helper_traits<experimental::precision::u4> { // traits for the 4-bit unsigned tag
+    typedef experimental::precision::u4 element_type;
+    typedef unsigned int storage_element_type; // u4 elements are stored packed in unsigned int
+    typedef unsigned int fill_argument_type;   // fill value; __get_storage_value keeps only its low 4 bits
+  };
+
+  template<> struct helper_traits<experimental::precision::s4> { // traits for the 4-bit signed tag
+    typedef experimental::precision::s4 element_type;
+    typedef int storage_element_type; // s4 elements are stored packed in int
+    typedef int fill_argument_type;   // fill value; __get_storage_value keeps only its low 4 bits
+  };
+  
+  template<> struct helper_traits<experimental::precision::b1> { // traits for the 1-bit tag
+    typedef experimental::precision::b1 element_type;
+    typedef unsigned int storage_element_type; // b1 elements are stored packed in unsigned int
+    typedef unsigned int fill_argument_type;   // fill value; __get_storage_value keeps only its lowest bit
+  };
+#endif /* __CUDA_SUBBYTE_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  template<> struct helper_traits<precision::tf32> { // traits for the tf32 tag
+    typedef precision::tf32 element_type;
+    typedef float storage_element_type; // tf32 elements are stored as float
+    typedef float fill_argument_type;   // fill value is a float
+  };
+#endif  /* __CUDA_AMPERE_MMA__ */
+  
+  // 
+  // The base fragment type
+  // 
+  /* note: alignment required for compiler implementation
+
+     Template parameters:
+       T           - fragment element type; helper_traits<T> gives the storage type
+       size        - number of logical elements, exposed as num_elements
+       packed_size - number of storage elements, exposed as num_storage_elements;
+                     defaults to size (no packing)
+  */
+  template <typename T, int size, int packed_size = size> 
+  struct __align__(8) __frag_base {
+
+    /* Number of elements in the fragment */
+    enum {num_elements = size};
+    
+    /* Number of storage elements in the fragment. 
+
+       The elements of the fragment are packed together when the 
+       fragment element type is experimental::precision::u4, 
+       experimental::precision::s4 or experimental::precision::b1.
+       When elements are packed, num_storage_elements 
+       will be smaller than num_elements.
+    */
+    enum {num_storage_elements = packed_size};
+
+    /* element type of the fragment */
+    typedef T element_type;
+
+    /* element type of the storage representation. 
+    
+       The mapping from element_type to storage_element_type is as follows:
+       experimental::precision::u4 -> unsigned (8 elements in 1 storage element)
+       experimental::precision::s4 -> int (8 elements in 1 storage element)
+       experimental::precision::b1 -> unsigned (32 elements in 1 storage element)
+       precision::tf32             -> float (1 element in 1 storage element)       
+       all other types T           -> T
+    */
+    typedef typename helper_traits<T>::storage_element_type storage_element_type;
+
+    /* Storage for the (possibly packed) fragment elements. */
+    storage_element_type x[num_storage_elements];
+  };
+
+  /* Generic storage-value conversion: the fill argument is returned unchanged
+     (implicitly converted to StorageType). */
+  template <typename FragEleType, typename StorageType, typename ArgType>
+  static inline __device__ StorageType
+  __get_storage_value(ArgType in)
+  {
+    return in;
+  }
+
+#ifdef __CUDA_SUBBYTE_IMMA__
+  /* Storage value for an experimental::precision::u4 fill argument.
+     Only the low 4 bits of in are used; that nibble is replicated into all
+     eight nibbles of the returned 32-bit word. */
+  template<>
+  __device__ inline unsigned 
+  __get_storage_value<experimental::precision::u4, unsigned, unsigned>(unsigned in)
+  {
+    /* For experimental::precision::u4 fragment element type, pack 8 elements into a single 
+       32-bit unsigned int storage element */
+    unsigned val = in & 0xf;
+    return (val | (val << 4) | (val << 8) | (val << 12) | (val << 16) |
+            (val << 20) | (val << 24) | (val << 28));
+  }
+
+  /* Storage value for an experimental::precision::s4 fill argument.
+     Only the low 4 bits of in are used; that nibble is replicated into all
+     eight nibbles of the returned 32-bit word. */
+  template<>
+  __device__ inline int
+  __get_storage_value<experimental::precision::s4, int, int>(int in)
+  {
+    /* For experimental::precision::s4 fragment element type, pack 8 elements into a single 
+       32-bit signed int storage element */
+    /* Pack in unsigned arithmetic: shifting a signed int left by 28 when bit 3
+       of the nibble is set would shift into the sign bit. */
+    unsigned val = static_cast<unsigned>(in) & 0xfU;
+    unsigned packed = (val | (val << 4) | (val << 8) | (val << 12) | (val << 16) |
+                       (val << 20) | (val << 24) | (val << 28));
+    /* Same bit pattern as before, reinterpreted as the signed storage type. */
+    return static_cast<int>(packed);
+  }
+  
+  /* Storage value for an experimental::precision::b1 fill argument: the lowest
+     bit of in is replicated into all 32 bits of the storage element. */
+  template<>
+  __device__ inline unsigned
+  __get_storage_value<experimental::precision::b1, unsigned, unsigned>(unsigned in)
+  {
+    const bool bit_set = (in & 0x1) != 0;
+    if (bit_set) {
+      return 0xFFFFFFFFU;
+    }
+    return 0U;
+  }
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+  /* Sets every storage element of fragment f to the value derived from in.
+
+     The mapping from fragment element type (FragEleType) to fill_argument_type is:
+       experimental::precision::u4 -> unsigned (only lower 4 bits taken)
+       experimental::precision::s4 -> int (only lower 4 bits taken)
+       experimental::precision::b1 -> unsigned (only lowest 1 bit taken)
+       precision::tf32             -> float
+       all other types T           -> T
+  */
+  template <typename FragEleType, int size, int packed_size>
+  __CUDA_MMA_DEVICE_DECL__ void
+  fill_fragment(__frag_base<FragEleType, size, packed_size>& f,
+                const typename helper_traits<FragEleType>::fill_argument_type & in)
+  {
+    typedef typename helper_traits<FragEleType>::storage_element_type packed_t;
+
+    /* Obtain the (possibly packed) storage value once; the __get_storage_value
+       specializations handle element types whose storage is packed. */
+    packed_t fill_value = __get_storage_value<FragEleType, packed_t>(in);
+
+#pragma unroll
+    for (int idx = 0; idx < f.num_storage_elements; idx++) {
+      f.x[idx] = fill_value;
+    }
+  }
+  
+  // 
+  // Fragment template
+  // Declared but not defined here: only the explicit specializations that follow are complete types.
+  template<typename Use, int m, int n, int k, typename T, typename Layout=void> class fragment; // Layout defaults to void
+
+  // 
+  // Fragments for 16x16x16 (template arguments m, n, k)
+  // Each specialization is an empty class; element type and num_elements come from its __frag_base.
+  template<> class fragment<matrix_a, 16, 16, 16, __half, row_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_a, 16, 16, 16, __half, col_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_b, 16, 16, 16, __half, row_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_b, 16, 16, 16, __half, col_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<accumulator, 16, 16, 16, __half> : public __frag_base<__half, 8> {}; // accumulators use the default Layout (void)
+  template<> class fragment<accumulator, 16, 16, 16, float> : public __frag_base<float, 8> {};
+
+#ifdef __CUDA_IMMA__
+  template<> class fragment<matrix_a, 16, 16, 16, signed char, row_major> : public __frag_base<signed char, 8> {};
+  template<> class fragment<matrix_a, 16, 16, 16, signed char, col_major> : public __frag_base<signed char, 8> {};
+  template<> class fragment<matrix_a, 16, 16, 16, unsigned char, row_major> : public __frag_base<unsigned char, 8> {};
+  template<> class fragment<matrix_a, 16, 16, 16, unsigned char, col_major> : public __frag_base<unsigned char, 8> {};
+  template<> class fragment<matrix_b, 16, 16, 16, signed char, row_major> : public __frag_base<signed char, 8> {};
+  template<> class fragment<matrix_b, 16, 16, 16, signed char, col_major> : public __frag_base<signed char, 8> {};  
+  template<> class fragment<matrix_b, 16, 16, 16, unsigned char, row_major> : public __frag_base<unsigned char, 8> {};
+  template<> class fragment<matrix_b, 16, 16, 16, unsigned char, col_major> : public __frag_base<unsigned char, 8> {};  
+  template<> class fragment<accumulator, 16, 16, 16, int> : public __frag_base<int, 8> {};
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  template<> class fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 8> {};
+  template<> class fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 8> {};
+  template<> class fragment<matrix_b, 16, 16, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 8> {};
+  template<> class fragment<matrix_b, 16, 16, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 8> {};
+#endif  /* __CUDA_AMPERE_MMA__ */
+  
+  // 
+  // Fragments for 32x8x16 (template arguments m, n, k)
+  // Each specialization is an empty class; element type and num_elements come from its __frag_base.
+  template<> class fragment<matrix_a, 32, 8, 16, __half, row_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_a, 32, 8, 16, __half, col_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_b, 32, 8, 16, __half, row_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_b, 32, 8, 16, __half, col_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<accumulator, 32, 8, 16, __half> : public __frag_base<__half, 8> {};
+  template<> class fragment<accumulator, 32, 8, 16, float> : public __frag_base<float, 8> {};
+
+#ifdef __CUDA_IMMA__
+  template<> class fragment<matrix_a, 32, 8, 16, signed char, row_major> : public __frag_base<signed char, 16> {}; // matrix_a: 16 elements, matrix_b: 4 in this shape
+  template<> class fragment<matrix_a, 32, 8, 16, signed char, col_major> : public __frag_base<signed char, 16> {};
+  template<> class fragment<matrix_a, 32, 8, 16, unsigned char, row_major> : public __frag_base<unsigned char, 16> {};
+  template<> class fragment<matrix_a, 32, 8, 16, unsigned char, col_major> : public __frag_base<unsigned char, 16> {};
+  template<> class fragment<matrix_b, 32, 8, 16, signed char, row_major> : public __frag_base<signed char, 4> {};
+  template<> class fragment<matrix_b, 32, 8, 16, signed char, col_major> : public __frag_base<signed char, 4> {};
+  template<> class fragment<matrix_b, 32, 8, 16, unsigned char, row_major> : public __frag_base<unsigned char, 4> {};
+  template<> class fragment<matrix_b, 32, 8, 16, unsigned char, col_major> : public __frag_base<unsigned char, 4> {};
+  template<> class fragment<accumulator, 32, 8, 16, int> : public __frag_base<int, 8> {};
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  template<> class fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 16> {};
+  template<> class fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 16> {};
+  template<> class fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 4> {};
+  template<> class fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 4> {};
+#endif  /* __CUDA_AMPERE_MMA__ */
+  
+  // 
+  // Fragments for 8x32x16 (template arguments m, n, k)
+  // Each specialization is an empty class; element type and num_elements come from its __frag_base.
+  template<> class fragment<matrix_a, 8, 32, 16, __half, row_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_a, 8, 32, 16, __half, col_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_b, 8, 32, 16, __half, row_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<matrix_b, 8, 32, 16, __half, col_major> : public __frag_base<__half, 16> {};
+  template<> class fragment<accumulator, 8, 32, 16, __half> : public __frag_base<__half, 8> {};
+  template<> class fragment<accumulator, 8, 32, 16, float> : public __frag_base<float, 8> {};
+
+#ifdef __CUDA_IMMA__
+  template<> class fragment<matrix_a, 8, 32, 16, signed char, row_major> : public __frag_base<signed char, 4> {}; // matrix_a: 4 elements, matrix_b: 16 in this shape
+  template<> class fragment<matrix_a, 8, 32, 16, signed char, col_major> : public __frag_base<signed char, 4> {};
+  template<> class fragment<matrix_a, 8, 32, 16, unsigned char, row_major> : public __frag_base<unsigned char, 4> {};
+  template<> class fragment<matrix_a, 8, 32, 16, unsigned char, col_major> : public __frag_base<unsigned char, 4> {};
+  template<> class fragment<matrix_b, 8, 32, 16, signed char, row_major> : public __frag_base<signed char, 16> {};
+  template<> class fragment<matrix_b, 8, 32, 16, signed char, col_major> : public __frag_base<signed char, 16> {};
+  template<> class fragment<matrix_b, 8, 32, 16, unsigned char, row_major> : public __frag_base<unsigned char, 16> {};
+  template<> class fragment<matrix_b, 8, 32, 16, unsigned char, col_major> : public __frag_base<unsigned char, 16> {};
+  template<> class fragment<accumulator, 8, 32, 16, int> : public __frag_base<int, 8> {};
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  template<> class fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 4> {};
+  template<> class fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 4> {};
+  template<> class fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 16> {};
+  template<> class fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 16> {};
+#endif  /* __CUDA_AMPERE_MMA__ */  
+  
+#ifdef __CUDA_SUBBYTE_IMMA__
+  // 
+  // Fragments for 8x8x32
+  // u4/s4 operands: __frag_base<T, 8, 1> means 8 logical elements held in 1 storage element.
+  template<> class fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major> : public __frag_base<experimental::precision::u4, 8, 1> {}; // matrix_a is specialized for row_major only
+  template<> class fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major> : public __frag_base<experimental::precision::s4, 8, 1> {};
+  template<> class fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major> : public __frag_base<experimental::precision::u4, 8, 1> {}; // matrix_b is specialized for col_major only
+  template<> class fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major> : public __frag_base<experimental::precision::s4, 8, 1> {};
+  template<> class fragment<accumulator, 8, 8, 32, int> : public __frag_base<int, 2> {};
+
+  // 
+  // Fragments for 8x8x128
+  // b1 operands: __frag_base<T, 32, 1> means 32 logical elements held in 1 storage element.
+  template<> class fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major> : public __frag_base<experimental::precision::b1, 32, 1> {};
+  template<> class fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major> : public __frag_base<experimental::precision::b1, 32, 1> {};
+  template<> class fragment<accumulator, 8, 8, 128, int> : public __frag_base<int, 2> {};
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  //
+  // Fragments for 16x16x8
+  // precision::tf32 operands (stored as float via helper_traits) with a float accumulator.
+  template<> class fragment<matrix_a, 16, 16, 8, precision::tf32, row_major> : public __frag_base<precision::tf32, 4> {};
+  template<> class fragment<matrix_a, 16, 16, 8, precision::tf32, col_major> : public __frag_base<precision::tf32, 4> {};
+  template<> class fragment<matrix_b, 16, 16, 8, precision::tf32, row_major> : public __frag_base<precision::tf32, 4> {};
+  template<> class fragment<matrix_b, 16, 16, 8, precision::tf32, col_major> : public __frag_base<precision::tf32, 4> {};
+  template<> class fragment<accumulator, 16, 16, 8, float> : public __frag_base<float, 8> {};
+  
+  //
+  // Fragments for 8x8x4
+  // double operands (1 element each) with a double accumulator (2 elements).
+  template<> class fragment<matrix_a, 8, 8, 4, double, row_major> : public __frag_base<double, 1> {};
+  template<> class fragment<matrix_a, 8, 8, 4, double, col_major> : public __frag_base<double, 1> {};
+  template<> class fragment<matrix_b, 8, 8, 4, double, row_major> : public __frag_base<double, 1> {};
+  template<> class fragment<matrix_b, 8, 8, 4, double, col_major> : public __frag_base<double, 1> {};
+  template<> class fragment<accumulator, 8, 8, 4, double> : public __frag_base<double, 2> {};
+#endif  /* __CUDA_AMPERE_MMA__ */  
+
+  
+  // 
+  // Load functions for frags of shape m16n16k16
+  // __DEF_IF_HOST ends each declaration: "{ }" when neither __CUDA_ARCH__ nor _NVHPC_CUDA is defined, ";" otherwise.
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST // accumulator overloads take the layout as a run-time layout_t argument
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#endif  /* __CUDA_IMMA__ */
+  
+#ifdef __CUDA_AMPERE_MMA__
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  //
+  // Load functions for frags of shape m32n8k16
+  // __DEF_IF_HOST ends each declaration: "{ }" when neither __CUDA_ARCH__ nor _NVHPC_CUDA is defined, ";" otherwise.
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST // accumulator overloads take the layout as a run-time layout_t argument
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  //
+  // Load functions for frags of shape m8n32k16
+  // (fragment shape arguments <8, 32, 16>). Same overload set as the other k16 shapes: __half operands and __half/float accumulators unconditionally, 8-bit integer and __nv_bfloat16 operands behind feature macros. Accumulator overloads take an extra layout_t parameter because their fragment type has no layout argument.
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+  
+#ifdef __CUDA_IMMA__  /* signed char / unsigned char matrix_a and matrix_b fragments, plus the int accumulator */
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__  /* __nv_bfloat16 matrix_a and matrix_b fragments; no additional accumulator overload is declared here */
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+#ifdef __CUDA_SUBBYTE_IMMA__  /* experimental::precision s4 / u4 / b1 operand fragments and their int accumulators */
+  //
+  // Load functions for frags of shape m8n8k32
+  // Only row_major matrix_a and col_major matrix_b overloads are declared for s4/u4; the source pointer is an untyped const void*.
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 32, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+
+  //
+  // Load functions for frags of shape m8n8k128
+  // Only row_major matrix_a and col_major matrix_b overloads are declared for b1; the source pointer is an untyped const void*. The accumulator is int and takes a layout_t parameter.
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 128, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+
+#ifdef __CUDA_AMPERE_MMA__  /* precision::tf32 (m16n16k8) and double (m8n8k4) fragments */
+  //
+  // Load functions for frags of shape m16n16k8
+  // precision::tf32 operand fragments are loaded from const float*; the accumulator is float and takes an extra layout_t parameter.
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 8, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+  
+  //
+  // Load functions for frags of shape m8n8k4
+  // Operand and accumulator fragments are all double and are loaded from const double*; the accumulator overload takes an extra layout_t parameter.
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 4, double>& a, const double* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  // store_matrix_sync(p, a, ldm, layout): the destination pointer comes first and the fragment is taken by const reference.
+  // Store functions for frags of shape m16n16k16
+  // Only accumulator fragments have store overloads: __half and float unconditionally, int behind __CUDA_IMMA__.
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 16, 16, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 16, 16, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 16, 16, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#endif  /* __CUDA_IMMA__ */  
+
+  // 
+  // Store functions for frags of shape m32n8k16
+  // Same three accumulator element types as m16n16k16 (__half, float, and int behind __CUDA_IMMA__).
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 32, 8, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 32, 8, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 32, 8, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#endif  /* __CUDA_IMMA__ */
+
+  // 
+  // Store functions for frags of shape m8n32k16
+  // Same three accumulator element types as m16n16k16 (__half, float, and int behind __CUDA_IMMA__).
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 8, 32, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 8, 32, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 32, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_SUBBYTE_IMMA__  /* stores for the int accumulators of the sub-byte shapes */
+  // 
+  // Store functions for frags of shape m8n8k32
+  // Single overload: int accumulator written through int* with a layout_t parameter.
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 32, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+
+  // 
+  // Store functions for frags of shape m8n8k128
+  // Single overload: int accumulator written through int* with a layout_t parameter.
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 128, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__  /* stores for the float (m16n16k8) and double (m8n8k4) accumulators */
+  //
+  // Store functions for frags of shape m16n16k8
+  // Single overload: float accumulator written through float* with a layout_t parameter.
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 16, 16, 8, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+
+  //
+  // Store functions for frags of shape m8n8k4
+  // Single overload: double accumulator written through double* with a layout_t parameter.
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(double *p, const fragment<accumulator, 8, 8, 4, double>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  // mma_sync(d, a, b, c): d is the only fragment taken by non-const reference; a, b and c are const references.
+  // MMA functions for shape m16n16k16
+  // 16 __half-operand overloads: the four row_major/col_major combinations of a and b, repeated for each (d, c) element-type pairing in the order __half/__half, float/__half, float/float, __half/float.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+
+#ifdef __CUDA_IMMA__  /* a and b are both signed char or both unsigned char (no mixed pairs); d and c are int; trailing `bool satf` defaults to false */
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__  /* __nv_bfloat16 operands: only float d and float c, in all four a/b layout combinations */
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  // mma_sync(d, a, b, c): d is the only fragment taken by non-const reference; a, b and c are const references.
+  // MMA functions for shape m32n8k16
+  // 16 __half-operand overloads: the four row_major/col_major combinations of a and b, repeated for each (d, c) element-type pairing in the order __half/__half, float/__half, float/float, __half/float.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+
+#ifdef __CUDA_IMMA__  /* a and b are both signed char or both unsigned char (no mixed pairs); d and c are int; trailing `bool satf` defaults to false */
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__  /* __nv_bfloat16 operands: only float d and float c, in all four a/b layout combinations */
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  // 
+  // MMA functions for shape m8n32k16
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
+  
+#ifdef __CUDA_IMMA__  // m8n32k16 mma_sync declarations for signed/unsigned char operands with int accumulators; the trailing satf flag defaults to false
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST  // signed char: a row_major, b col_major
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST  // signed char: a col_major, b col_major
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST  // signed char: a row_major, b row_major
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST  // signed char: a col_major, b row_major
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST  // unsigned char: a row_major, b col_major
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST  // unsigned char: a col_major, b col_major
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST  // unsigned char: a row_major, b row_major
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST  // unsigned char: a col_major, b row_major
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__  // m8n32k16 mma_sync declarations for __nv_bfloat16 operands with float accumulators, one per (a layout, b layout) pair
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST  // a row_major, b col_major
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST  // a col_major, b col_major
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST  // a row_major, b row_major
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST  // a col_major, b row_major
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+#ifdef __CUDA_SUBBYTE_IMMA__  // declarations for the sub-byte (s4 / u4 / b1) operand types
+  // MMA functions for shape m8n8k32
+  // Operands are experimental::precision::s4 or u4 with int accumulators.
+  // Only the a = row_major, b = col_major combination is declared; satf defaults to false.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf=false) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf=false) __DEF_IF_HOST
+  
+
+  // MMA functions for shape m8n8k128
+  // Operands are experimental::precision::b1; only a = row_major, b = col_major is declared.
+  // The two trailing parameters default to bmmaBitOpXOR and bmmaAccumulateOpPOPC.
+  __CUDA_MMA_DEVICE_DECL__ void bmma_sync(fragment<accumulator, 8, 8, 128, int>& d, const fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& b, const fragment<accumulator, 8, 8, 128, int>& c,
+                                          experimental::bmmaBitOp = experimental::bmmaBitOpXOR, 
+                                          experimental::bmmaAccumulateOp = experimental::bmmaAccumulateOpPOPC) __DEF_IF_HOST
+
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__  // declarations for the precision::tf32 and double operand types
+  // MMA functions for shape m16n16k8
+  // Operands are precision::tf32 with float accumulators;
+  // one overload is declared per (a layout, b layout) combination.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
+
+  // MMA functions for shape m8n8k4
+  // Operands and accumulators are all double;
+  // one overload is declared per (a layout, b layout) combination.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
+#endif  /* __CUDA_AMPERE_MMA__ */
+};
+};
+
+#undef __DEF_IF_HOST  // the declaration-suffix macro is removed once the declarations above end
+#undef __CUDA_IMMA__  // the three feature macros are removed as well; crt/mma.hpp defines its own copies
+#undef __CUDA_SUBBYTE_IMMA__
+#undef __CUDA_AMPERE_MMA__
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __CUDA_MMA_DEVICE_DECL__  // dropped before the include below; mma.hpp defines this macro again for itself
+
+#if defined(__CUDA_ARCH__)  // the definitions in mma.hpp are pulled in only when __CUDA_ARCH__ is defined
+#include "mma.hpp"
+#endif /* defined(__CUDA_ARCH__) */
+
+
+#endif /* !__CUDA_MMA_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__)  // marker set (presumably at the top of this header, outside this view): undo the opt-in
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__  // remove the internal-headers opt-in macro
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__  // and remove the marker itself
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/mma.hpp b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/mma.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3e10f2a982bd2dcf9814a2fc05a3f200d5a1cb07
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/mma.hpp
@@ -0,0 +1,1128 @@
+/*
+ * Copyright 2017-2020 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)  // the includer did not define the internal-headers macro: report direct use of this header
+#if defined(_MSC_VER)  // MSVC gets the notice through #pragma message, every other compiler through #warning
+#pragma message("crt/mma.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/mma.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__  // defined here after the notice so the macro is set for the rest of the header
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_HPP__  // marker that this header made the definition above (presumably undone at the end of the header; not visible here)
+#endif
+
+#if !defined(__CUDA_MMA_HPP__)
+#define __CUDA_MMA_HPP__
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+#define __CUDA_MMA_DEVICE_DECL__ static __device__ __inline__  // qualifier prefix placed in front of the function definitions below
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720  // __CUDA_ARCH__ undefined or >= 720: enables the signed/unsigned char overloads guarded by __CUDA_IMMA__
+#define __CUDA_IMMA__ 1
+#endif  /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720 */
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730  // __CUDA_ARCH__ undefined or >= 730: enables the s4 / u4 / b1 overloads guarded by __CUDA_SUBBYTE_IMMA__
+#define __CUDA_SUBBYTE_IMMA__ 1
+#endif  /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730 */
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800  // __CUDA_ARCH__ undefined or >= 800: enables the __nv_bfloat16 / tf32 / double overloads guarded by __CUDA_AMPERE_MMA__
+#define __CUDA_AMPERE_MMA__ 1
+#endif  /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 */
+
+namespace nvcuda {
+namespace wmma {
+
+  // Load functions for frags of shape m16n16k16 (__half operands; __half and float accumulators).
+  // Each overload casts the fragment and the source pointer and forwards them, with ldm, to a compiler
+  // builtin whose last argument is a literal: 0 for row_major / mem_row_major, 1 otherwise.
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m16n16k16_ld_a((int*)&a, (const int*)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m16n16k16_ld_a((int*)&a, (const int*)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b,16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m16n16k16_ld_b((int*)&a, (const int*)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b,16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m16n16k16_ld_b((int*)&a, (const int*)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator,16, 16, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // NOTE(review): branching on literals presumably because the builtin needs a constant flag - confirm
+      __hmma_m16n16k16_ld_c_f16((int*)&a, (const int*)p, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __hmma_m16n16k16_ld_c_f16((int*)&a, (const int*)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator,16, 16, 16, float>& a, const float* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)
+      __hmma_m16n16k16_ld_c_f32((float*)&a, (const float*)p, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __hmma_m16n16k16_ld_c_f32((float*)&a, (const float*)p, ldm, 1);
+  }
+
+#ifdef __CUDA_IMMA__  // m16n16k16 loads for signed/unsigned char operands and the int accumulator; last builtin argument is 0 for row-major, 1 for col-major
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m16n16k16_ld_a_s8((int *)&a, (const int *)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m16n16k16_ld_a_s8((int *)&a, (const int *)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m16n16k16_ld_a_u8((int *)&a, (const int *)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m16n16k16_ld_a_u8((int *)&a, (const int *)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m16n16k16_ld_b_s8((int *)&a, (const int *)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m16n16k16_ld_b_s8((int *)&a, (const int *)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m16n16k16_ld_b_u8((int *)&a, (const int *)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m16n16k16_ld_b_u8((int *)&a, (const int *)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator,16, 16, 16, int>& a, const int* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)
+      __imma_m16n16k16_ld_c((int *)&a, (const int*)p, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __imma_m16n16k16_ld_c((int *)&a, (const int*)p, ldm, 1);
+  }
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__  // m16n16k16 loads for __nv_bfloat16 matrix_a / matrix_b fragments; last builtin argument is 0 for row_major, 1 for col_major
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
+    __mma_bf16_m16n16k16_ld_a((int*)&a, (const int*)p, ldm, 0);
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
+    __mma_bf16_m16n16k16_ld_a((int*)&a, (const int*)p, ldm, 1);
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm)  {
+    __mma_bf16_m16n16k16_ld_b((int*)&a, (const int*)p, ldm, 0);
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm)  {
+    __mma_bf16_m16n16k16_ld_b((int*)&a, (const int*)p, ldm, 1);
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+
+  // Load functions for frags of shape m32n8k16 (__half operands; __half and float accumulators).
+  // Each overload casts the fragment and the source pointer and forwards them, with ldm, to a compiler
+  // builtin whose last argument is a literal: 0 for row_major / mem_row_major, 1 otherwise.
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m32n8k16_ld_a((int*)&a, (const int*)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m32n8k16_ld_a((int*)&a, (const int*)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m32n8k16_ld_b((int*)&a, (const int*)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m32n8k16_ld_b((int*)&a, (const int*)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)
+      __hmma_m32n8k16_ld_c_f16((int*)&a, (const int*)p, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __hmma_m32n8k16_ld_c_f16((int*)&a, (const int*)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, float>& a, const float* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)
+      __hmma_m32n8k16_ld_c_f32((float*)&a, (const float*)p, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __hmma_m32n8k16_ld_c_f32((float*)&a, (const float*)p, ldm, 1);
+  }
+
+#ifdef __CUDA_IMMA__  // m32n8k16 loads for signed/unsigned char operands and the int accumulator; last builtin argument is 0 for row-major, 1 for col-major
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m32n8k16_ld_a_s8((int *)&a, (const int *)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m32n8k16_ld_a_s8((int *)&a, (const int *)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m32n8k16_ld_a_u8((int *)&a, (const int *)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m32n8k16_ld_a_u8((int *)&a, (const int *)p, ldm, 1);
+  }
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m32n8k16_ld_b_s8((int *)&a, (const int *)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m32n8k16_ld_b_s8((int *)&a, (const int *)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m32n8k16_ld_b_u8((int *)&a, (const int *)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m32n8k16_ld_b_u8((int *)&a, (const int *)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, int>& a, const int* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)
+      __imma_m32n8k16_ld_c((int *)&a, (const int*)p, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __imma_m32n8k16_ld_c((int *)&a, (const int*)p, ldm, 1);
+  }
+#endif  /* __CUDA_IMMA__ */ 
+
+#ifdef __CUDA_AMPERE_MMA__  // m32n8k16 loads for __nv_bfloat16 matrix_a / matrix_b fragments; last builtin argument is 0 for row_major, 1 for col_major
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
+    __mma_bf16_m32n8k16_ld_a((int*)&a, (const int*)p, ldm, 0);
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
+    __mma_bf16_m32n8k16_ld_a((int*)&a, (const int*)p, ldm, 1);
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
+    __mma_bf16_m32n8k16_ld_b((int*)&a, (const int*)p, ldm, 0);
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
+    __mma_bf16_m32n8k16_ld_b((int*)&a, (const int*)p, ldm, 1);
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+
+  // Load functions for frags of shape m8n32k16 (__half operands; __half and float accumulators).
+  // Each overload casts the fragment and the source pointer and forwards them, with ldm, to a compiler
+  // builtin whose last argument is a literal: 0 for row_major / mem_row_major, 1 otherwise.
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m8n32k16_ld_a((int*)&a, (const int*)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m8n32k16_ld_a((int*)&a, (const int*)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m8n32k16_ld_b((int*)&a, (const int*)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
+    __hmma_m8n32k16_ld_b((int*)&a, (const int*)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)
+      __hmma_m8n32k16_ld_c_f16((int*)&a, (const int*)p, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __hmma_m8n32k16_ld_c_f16((int*)&a, (const int*)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, float>& a, const float* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)
+      __hmma_m8n32k16_ld_c_f32((float*)&a, (const float*)p, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __hmma_m8n32k16_ld_c_f32((float*)&a, (const float*)p, ldm, 1);
+  }
+  
+#ifdef __CUDA_IMMA__  // m8n32k16 loads for signed/unsigned char operands and the int accumulator; last builtin argument is 0 for row-major, 1 for col-major
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m8n32k16_ld_a_s8((int *)&a, (const int *)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m8n32k16_ld_a_s8((int *)&a, (const int *)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m8n32k16_ld_a_u8((int *)&a, (const int *)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m8n32k16_ld_a_u8((int *)&a, (const int *)p, ldm, 1);
+  }
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m8n32k16_ld_b_s8((int *)&a, (const int *)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
+    __imma_m8n32k16_ld_b_s8((int *)&a, (const int *)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m8n32k16_ld_b_u8((int *)&a, (const int *)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
+    __imma_m8n32k16_ld_b_u8((int *)&a, (const int *)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, int>& a, const int* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)
+      __imma_m8n32k16_ld_c((int *)&a, (const int*)p, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __imma_m8n32k16_ld_c((int *)&a, (const int*)p, ldm, 1);
+  }
+#endif  /* __CUDA_IMMA__ */ 
+
+#ifdef __CUDA_AMPERE_MMA__  // m8n32k16 loads for __nv_bfloat16 matrix_a / matrix_b fragments; last builtin argument is 0 for row_major, 1 for col_major
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
+    __mma_bf16_m8n32k16_ld_a((int*)&a, (const int*)p, ldm, 0);
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
+    __mma_bf16_m8n32k16_ld_a((int*)&a, (const int*)p, ldm, 1);
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
+    __mma_bf16_m8n32k16_ld_b((int*)&a, (const int*)p, ldm, 0);
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
+    __mma_bf16_m8n32k16_ld_b((int*)&a, (const int*)p, ldm, 1);
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */
+  
+
+#ifdef __CUDA_SUBBYTE_IMMA__  // loads for the sub-byte (s4 / u4 / b1) fragment types; the source is taken as const void* and cast to const int*
+  // Load functions for frags of shape m8n8k32
+  // Only row_major matrix_a (flag 0) and col_major matrix_b (flag 1) overloads exist for s4 / u4;
+  // the int accumulator load picks flag 0 for mem_row_major and flag 1 otherwise.
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const void* p, unsigned ldm) {
+      __imma_m8n8k32_ld_a_s4((int *)&a, (const int *)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const void* p, unsigned ldm) {
+      __imma_m8n8k32_ld_a_u4((int *)&a, (const int *)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& a, const void* p, unsigned ldm) {
+      __imma_m8n8k32_ld_b_s4((int *)&a, (const int *)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& a, const void* p, unsigned ldm) {
+      __imma_m8n8k32_ld_b_u4((int *)&a, (const int *)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 32, int>& a, const int* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)
+      __imma_m8n8k32_ld_c((int *)&a, (const int*)p, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __imma_m8n8k32_ld_c((int *)&a, (const int*)p, ldm, 1);
+  }
+
+  // Load functions for frags of shape m8n8k128
+  // Only row_major matrix_a (flag 0) and col_major matrix_b (flag 1) overloads exist for b1;
+  // the int accumulator load picks flag 0 for mem_row_major and flag 1 otherwise.
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const void* p, unsigned ldm) {
+    __bmma_m8n8k128_ld_a_b1((int *)&a, (const int *)p, ldm, 0);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& a, const void* p, unsigned ldm) {
+    __bmma_m8n8k128_ld_b_b1((int *)&a, (const int *)p, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 128, int>& a, const int* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)
+      __bmma_m8n8k128_ld_c((int *)&a, (const int*)p, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __bmma_m8n8k128_ld_c((int *)&a, (const int*)p, ldm, 1);
+  }
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+
+
+#ifdef __CUDA_AMPERE_MMA__  // loads for precision::tf32 (m16n16k8) and double (m8n8k4) fragments; last builtin argument is 0 for row-major, 1 for col-major
+  // load functions for frags of shape m16n16k8: tf32 operands are read through a const float* that is cast to const int*; the float accumulator pointer is passed uncast
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) {
+    __mma_tf32_m16n16k8_ld_a((int *)&a, (const int *)p, ldm, 0);
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) {
+    __mma_tf32_m16n16k8_ld_a((int *)&a, (const int *)p, ldm, 1);
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) {
+    __mma_tf32_m16n16k8_ld_b((int *)&a, (const int *)p, ldm, 0);
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) {
+    __mma_tf32_m16n16k8_ld_b((int *)&a, (const int *)p, ldm, 1);
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 8, float>& a, const float* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)
+      __mma_tf32_m16n16k8_ld_c((float *)&a, p, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __mma_tf32_m16n16k8_ld_c((float *)&a, p, ldm, 1);      
+  }
+  
+  // load functions for frags of shape m8n8k4: double fragments are cast to double* and the const double* source is passed through unchanged
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) {
+    __dmma_m8n8k4_ld_a((double *)&a, p, ldm, 0);
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) {
+    __dmma_m8n8k4_ld_a((double *)&a, p, ldm, 1);
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) {
+    __dmma_m8n8k4_ld_b((double *)&a, p, ldm, 0);
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) {
+    __dmma_m8n8k4_ld_b((double *)&a, p, ldm, 1);
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 4, double>& a, const double* p, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)
+      __dmma_m8n8k4_ld_c((double *)&a, p, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __dmma_m8n8k4_ld_c((double *)&a, p, ldm, 1);      
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  // Store functions for frags of shape m16n16k16 (__half and float accumulators).
+  // The destination pointer and the fragment are cast (the C-style cast also drops the fragment's
+  // const) and forwarded with ldm to a builtin; last argument is 0 for mem_row_major, 1 otherwise.
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator,16, 16, 16, __half>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)
+      __hmma_m16n16k16_st_c_f16((int*)p, (int*)&a, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __hmma_m16n16k16_st_c_f16((int*)p, (int*)&a, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator,16, 16, 16, float>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)
+      __hmma_m16n16k16_st_c_f32((float*)p, (float*)&a, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __hmma_m16n16k16_st_c_f32((float*)p, (float*)&a, ldm, 1);
+  }
+  
+#ifdef __CUDA_IMMA__  // m16n16k16 store of the int accumulator; p is passed uncast and the fragment is viewed as const int*
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator,16, 16, 16, int>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // flag 0 for mem_row_major
+      __imma_m16n16k16_st_c_i32(p, (const int*)&a, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __imma_m16n16k16_st_c_i32(p, (const int*)&a, ldm, 1);
+  }
+#endif  /* __CUDA_IMMA__ */
+
+  // Store functions for frags of shape m32n8k16 (__half and float accumulators).
+  // The destination pointer and the fragment are cast (the C-style cast also drops the fragment's
+  // const) and forwarded with ldm to a builtin; last argument is 0 for mem_row_major, 1 otherwise.
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 32, 8, 16, __half>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)
+      __hmma_m32n8k16_st_c_f16((int*)p, (int*)&a, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __hmma_m32n8k16_st_c_f16((int*)p, (int*)&a, ldm, 1);
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 32, 8, 16, float>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)
+      __hmma_m32n8k16_st_c_f32((float*)p, (float*)&a, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __hmma_m32n8k16_st_c_f32((float*)p, (float*)&a, ldm, 1);
+  }
+  
+#ifdef __CUDA_IMMA__  // m32n8k16 store of the int accumulator; p is passed uncast and the fragment is viewed as const int*
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 32, 8, 16, int>& a, unsigned ldm, layout_t layout) {
+    if (layout == mem_row_major)  // flag 0 for mem_row_major
+      __imma_m32n8k16_st_c_i32(p, (const int*)&a, ldm, 0);
+    else  // any layout value other than mem_row_major takes the flag-1 path
+      __imma_m32n8k16_st_c_i32(p, (const int*)&a, ldm, 1);
+  }
+#endif  /* __CUDA_IMMA__ */
+
+  // 
+  // Store functions for frags of shape m8n32k16
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 8, 32, 16, __half>& a, unsigned ldm, layout_t layout) {  // Overload for a __half m8n32k16 accumulator fragment; p and &a are reinterpreted as int* and forwarded with ldm to the st_c intrinsic.
+    if (layout == mem_row_major)  // NOTE(review): two calls with literal 0/1, presumably because the argument must be a constant - confirm
+      __hmma_m8n32k16_st_c_f16((int*)p, (int*)&a, ldm, 0);  // mem_row_major -> 0
+    else
+      __hmma_m8n32k16_st_c_f16((int*)p, (int*)&a, ldm, 1);  // any other layout value -> 1
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 8, 32, 16, float>& a, unsigned ldm, layout_t layout) {  // Overload for a float m8n32k16 accumulator fragment; forwards p, &a and ldm to the st_c intrinsic, last argument chosen from layout.
+    if (layout == mem_row_major)  // NOTE(review): two calls with literal 0/1, presumably because the argument must be a constant - confirm
+      __hmma_m8n32k16_st_c_f32((float*)p, (float*)&a, ldm, 0);  // mem_row_major -> 0
+    else
+      __hmma_m8n32k16_st_c_f32((float*)p, (float*)&a, ldm, 1);  // any other layout value -> 1
+  }
+
+#ifdef __CUDA_IMMA__
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 32, 16, int>& a, unsigned ldm, layout_t layout) {  // Overload for an int m8n32k16 accumulator fragment; forwards p, &a and ldm to the st_c intrinsic, last argument chosen from layout.
+    if (layout == mem_row_major)  // NOTE(review): two calls with literal 0/1, presumably because the argument must be a constant - confirm
+      __imma_m8n32k16_st_c_i32(p, (const int*)&a, ldm, 0);  // mem_row_major -> 0
+    else
+      __imma_m8n32k16_st_c_i32(p, (const int*)&a, ldm, 1);  // any other layout value -> 1
+  }
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_SUBBYTE_IMMA__
+  // 
+  // Store functions for frags of shape m8n8k32
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 32, int>& a, unsigned ldm, layout_t layout) {  // Overload for an int m8n8k32 accumulator fragment; forwards p, &a and ldm to the st_c intrinsic, last argument chosen from layout.
+    if (layout == mem_row_major)  // NOTE(review): two calls with literal 0/1, presumably because the argument must be a constant - confirm
+      __imma_m8n8k32_st_c_i32(p, (const int*)&a, ldm, 0);  // mem_row_major -> 0
+    else
+      __imma_m8n8k32_st_c_i32(p, (const int*)&a, ldm, 1);  // any other layout value -> 1
+  }
+
+  // 
+  // Store functions for frags of shape m8n8k128
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 128, int>& a, unsigned ldm, layout_t layout) {  // Overload for an int m8n8k128 accumulator fragment; forwards p, &a and ldm to the __bmma st_c intrinsic, last argument chosen from layout.
+    if (layout == mem_row_major)  // NOTE(review): two calls with literal 0/1, presumably because the argument must be a constant - confirm
+      __bmma_m8n8k128_st_c_i32(p, (const int*)&a, ldm, 0);  // mem_row_major -> 0
+    else
+      __bmma_m8n8k128_st_c_i32(p, (const int*)&a, ldm, 1);  // any other layout value -> 1
+  }
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+
+#ifdef __CUDA_AMPERE_MMA__
+
+  //
+  // Store functions for frags of shape m16n16k8
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 16, 16, 8, float>& a, unsigned ldm, layout_t layout) {  // Overload for a float m16n16k8 accumulator fragment; forwards p, &a and ldm to the st_c intrinsic, last argument chosen from layout.
+    if (layout == mem_row_major)  // NOTE(review): two calls with literal 0/1, presumably because the argument must be a constant - confirm
+      __mma_m16n16k8_st_c_f32(p, (const float*)&a, ldm, 0);  // mem_row_major -> 0
+    else
+      __mma_m16n16k8_st_c_f32(p, (const float*)&a, ldm, 1);  // any other layout value -> 1
+  }
+
+  
+  // 
+  // Store functions for frags of shape m8n8k4
+  // 
+  __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(double *p, const fragment<accumulator, 8, 8, 4, double>& a, unsigned ldm, layout_t layout) {  // Overload for a double m8n8k4 accumulator fragment; forwards p, &a and ldm to the st_c intrinsic, last argument chosen from layout.
+    if (layout == mem_row_major)  // NOTE(review): two calls with literal 0/1, presumably because the argument must be a constant - confirm
+      __dmma_m8n8k4_st_c_f64(p, (const double*)&a, ldm, 0);  // mem_row_major -> 0
+    else
+      __dmma_m8n8k4_st_c_f64(p, (const double*)&a, ldm, 1);  // any other layout value -> 1
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  // 
+  // MMA functions for shape m16n16k16
+  // 
+  // D fp16, C fp16
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {  // Overload: A row_major, B col_major; d __half, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m16n16k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);  // 1 = literal this file uses for A row_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {  // Overload: A col_major, B col_major; d __half, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m16n16k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);  // 3 = literal this file uses for A col_major / B col_major; trailing 0 is hard-coded
+  }
+    
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {  // Overload: A row_major, B row_major; d __half, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m16n16k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);  // first 0 = literal this file uses for A row_major / B row_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {  // Overload: A col_major, B row_major; d __half, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m16n16k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);  // 2 = literal this file uses for A col_major / B row_major; trailing 0 is hard-coded
+  }
+
+  // D fp32, C fp16
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {  // Overload: A row_major, B col_major; d float, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m16n16k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);  // 1 = literal this file uses for A row_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {  // Overload: A col_major, B col_major; d float, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m16n16k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);  // 3 = literal this file uses for A col_major / B col_major; trailing 0 is hard-coded
+  }
+    
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {  // Overload: A row_major, B row_major; d float, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+      __hmma_m16n16k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);  // first 0 = literal this file uses for A row_major / B row_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {  // Overload: A col_major, B row_major; d float, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m16n16k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);  // 2 = literal this file uses for A col_major / B row_major; trailing 0 is hard-coded
+  }
+
+  // D fp32, C fp32
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {  // Overload: A row_major, B col_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m16n16k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);  // 1 = literal this file uses for A row_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {  // Overload: A col_major, B col_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m16n16k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);  // 3 = literal this file uses for A col_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {  // Overload: A row_major, B row_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m16n16k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);  // first 0 = literal this file uses for A row_major / B row_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {  // Overload: A col_major, B row_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m16n16k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);  // 2 = literal this file uses for A col_major / B row_major; trailing 0 is hard-coded
+  }
+
+  // D fp16, C fp32
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {  // Overload: A row_major, B col_major; d __half, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m16n16k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);  // 1 = literal this file uses for A row_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {  // Overload: A col_major, B col_major; d __half, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m16n16k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);  // 3 = literal this file uses for A col_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {  // Overload: A row_major, B row_major; d __half, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m16n16k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);  // first 0 = literal this file uses for A row_major / B row_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {  // Overload: A col_major, B row_major; d __half, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m16n16k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);  // 2 = literal this file uses for A col_major / B row_major; trailing 0 is hard-coded
+  }
+
+#ifdef __CUDA_IMMA__  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {  // Overload: signed char A row_major, B col_major; int accumulators. satf selects the intrinsic's last argument.
+    if (satf)  // NOTE(review): separate calls with literal 1/0, presumably because the argument must be a constant - confirm
+      __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 1, 1);  // layout literal 1 (A row_major / B col_major), satf literal 1
+    else
+      __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 1, 0);  // layout literal 1, satf literal 0
+  }
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {  // Overload: signed char A col_major, B col_major; int accumulators. satf selects the intrinsic's last argument.
+    if (satf)  // NOTE(review): separate calls with literal 1/0, presumably because the argument must be a constant - confirm
+      __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 3, 1);  // layout literal 3 (A col_major / B col_major), satf literal 1
+    else
+      __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 3, 0);  // layout literal 3, satf literal 0
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {  // Overload: signed char A row_major, B row_major; int accumulators. satf selects the intrinsic's last argument.
+    if (satf)  // NOTE(review): separate calls with literal 1/0, presumably because the argument must be a constant - confirm
+      __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 0, 1);  // layout literal 0 (A row_major / B row_major), satf literal 1
+    else
+      __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 0, 0);  // layout literal 0, satf literal 0
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {  // Overload: signed char A col_major, B row_major; int accumulators. satf selects the intrinsic's last argument.
+    if (satf)  // NOTE(review): separate calls with literal 1/0, presumably because the argument must be a constant - confirm
+      __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 2, 1);  // layout literal 2 (A col_major / B row_major), satf literal 1
+    else
+      __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 2, 0);  // layout literal 2, satf literal 0
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {  // Overload: unsigned char A row_major, B col_major; int accumulators. satf selects the intrinsic's last argument.
+    if (satf)  // NOTE(review): separate calls with literal 1/0, presumably because the argument must be a constant - confirm
+      __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 1, 1);  // layout literal 1 (A row_major / B col_major), satf literal 1
+    else
+      __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 1, 0);  // layout literal 1, satf literal 0
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {  // Overload: unsigned char A col_major, B col_major; int accumulators. satf selects the intrinsic's last argument.
+    if (satf)  // NOTE(review): separate calls with literal 1/0, presumably because the argument must be a constant - confirm
+      __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 3, 1);  // layout literal 3 (A col_major / B col_major), satf literal 1
+    else
+      __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 3, 0);  // layout literal 3, satf literal 0
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {  // Overload: unsigned char A row_major, B row_major; int accumulators. satf selects the intrinsic's last argument.
+    if (satf)  // NOTE(review): separate calls with literal 1/0, presumably because the argument must be a constant - confirm
+      __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 0, 1);  // layout literal 0 (A row_major / B row_major), satf literal 1
+    else
+      __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 0, 0);  // layout literal 0, satf literal 0
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {  // Overload: unsigned char A col_major, B row_major; int accumulators. satf selects the intrinsic's last argument.
+    if (satf)  // NOTE(review): separate calls with literal 1/0, presumably because the argument must be a constant - confirm
+      __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 2, 1);  // layout literal 2 (A col_major / B row_major), satf literal 1
+    else
+      __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 2, 0);  // layout literal 2, satf literal 0
+  }
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {  // Overload: __nv_bfloat16 A row_major, B col_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __mma_bf16_m16n16k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);  // 1 = literal this file uses for A row_major / B col_major; trailing 0 is hard-coded
+  }
+    
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {  // Overload: __nv_bfloat16 A col_major, B col_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __mma_bf16_m16n16k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);    // 3 = literal this file uses for A col_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {  // Overload: __nv_bfloat16 A row_major, B row_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __mma_bf16_m16n16k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);    // first 0 = literal this file uses for A row_major / B row_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {  // Overload: __nv_bfloat16 A col_major, B row_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __mma_bf16_m16n16k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);    // 2 = literal this file uses for A col_major / B row_major; trailing 0 is hard-coded
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+
+  // 
+  // MMA functions for shape m32n8k16
+  // 
+  // D fp16, C fp16
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {  // Overload: A row_major, B col_major; d __half, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m32n8k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);  // 1 = literal this file uses for A row_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {  // Overload: A col_major, B col_major; d __half, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m32n8k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);  // 3 = literal this file uses for A col_major / B col_major; trailing 0 is hard-coded
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {  // Overload: A row_major, B row_major; d __half, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m32n8k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);  // first 0 = literal this file uses for A row_major / B row_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {  // Overload: A col_major, B row_major; d __half, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m32n8k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);  // 2 = literal this file uses for A col_major / B row_major; trailing 0 is hard-coded
+  }
+
+  // D fp32, C fp16
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {  // Overload: A row_major, B col_major; d float, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m32n8k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);  // 1 = literal this file uses for A row_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {  // Overload: A col_major, B col_major; d float, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m32n8k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);  // 3 = literal this file uses for A col_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {  // Overload: A row_major, B row_major; d float, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m32n8k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);  // first 0 = literal this file uses for A row_major / B row_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {  // Overload: A col_major, B row_major; d float, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m32n8k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);  // 2 = literal this file uses for A col_major / B row_major; trailing 0 is hard-coded
+  }
+
+  // D fp32, C fp32
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {  // Overload: A row_major, B col_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m32n8k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);  // 1 = literal this file uses for A row_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {  // Overload: A col_major, B col_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m32n8k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);  // 3 = literal this file uses for A col_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {  // Overload: A row_major, B row_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m32n8k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);  // first 0 = literal this file uses for A row_major / B row_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {  // Overload: A col_major, B row_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m32n8k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);  // 2 = literal this file uses for A col_major / B row_major; trailing 0 is hard-coded
+  }
+
+  // D fp16, C fp32
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {  // Overload: A row_major, B col_major; d __half, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m32n8k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);  // 1 = literal this file uses for A row_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {  // Overload: A col_major, B col_major; d __half, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m32n8k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);  // 3 = literal this file uses for A col_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {  // Overload: A row_major, B row_major; d __half, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m32n8k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);  // first 0 = literal this file uses for A row_major / B row_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {  // Overload: A col_major, B row_major; d __half, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m32n8k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);  // 2 = literal this file uses for A col_major / B row_major; trailing 0 is hard-coded
+  }
+
+#ifdef __CUDA_IMMA__  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {  // Overload: signed char A row_major, B col_major; int accumulators. satf selects the intrinsic's last argument.
+    if (satf)  // NOTE(review): separate calls with literal 1/0, presumably because the argument must be a constant - confirm
+      __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 1);  // layout literal 1 (A row_major / B col_major), satf literal 1
+    else
+      __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 0);  // layout literal 1, satf literal 0
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {  // Overload: signed char A col_major, B col_major; int accumulators. satf selects the intrinsic's last argument.
+    if (satf)  // NOTE(review): separate calls with literal 1/0, presumably because the argument must be a constant - confirm
+      __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, 1);  // layout literal 3 (A col_major / B col_major), satf literal 1
+    else
+      __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, 0);  // layout literal 3, satf literal 0
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {  // Overload: signed char A row_major, B row_major; int accumulators. satf selects the intrinsic's last argument.
+    if (satf)  // NOTE(review): separate calls with literal 1/0, presumably because the argument must be a constant - confirm
+      __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, 1);  // layout literal 0 (A row_major / B row_major), satf literal 1
+    else
+      __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, 0);  // layout literal 0, satf literal 0
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {  // Overload: signed char A col_major, B row_major; int accumulators. satf selects the intrinsic's last argument.
+    if (satf)  // NOTE(review): separate calls with literal 1/0, presumably because the argument must be a constant - confirm
+      __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, 1);  // layout literal 2 (A col_major / B row_major), satf literal 1
+    else
+      __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, 0);  // layout literal 2, satf literal 0
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {  // Overload: unsigned char A row_major, B col_major; int accumulators. satf selects the intrinsic's last argument.
+    if (satf)  // NOTE(review): separate calls with literal 1/0, presumably because the argument must be a constant - confirm
+      __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 1);  // layout literal 1 (A row_major / B col_major), satf literal 1
+    else
+      __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 0);  // layout literal 1, satf literal 0
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {  // Overload: unsigned char A col_major, B col_major; int accumulators. satf selects the intrinsic's last argument.
+    if (satf)  // NOTE(review): separate calls with literal 1/0, presumably because the argument must be a constant - confirm
+      __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, 1);  // layout literal 3 (A col_major / B col_major), satf literal 1
+    else
+      __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, 0);  // layout literal 3, satf literal 0
+
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {  // Overload: unsigned char A row_major, B row_major; int accumulators. satf selects the intrinsic's last argument.
+    if (satf)  // NOTE(review): separate calls with literal 1/0, presumably because the argument must be a constant - confirm
+      __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, 1);  // layout literal 0 (A row_major / B row_major), satf literal 1
+    else
+      __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, 0);  // layout literal 0, satf literal 0
+
+  }
+
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {  // Overload: unsigned char A col_major, B row_major; int accumulators. satf selects the intrinsic's last argument.
+    if (satf)  // NOTE(review): separate calls with literal 1/0, presumably because the argument must be a constant - confirm
+      __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, 1);  // layout literal 2 (A col_major / B row_major), satf literal 1
+    else
+      __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, 0);  // layout literal 2, satf literal 0
+
+  }
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) {  // Overload: __nv_bfloat16 A row_major, B col_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __mma_bf16_m32n8k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);  // 1 = literal this file uses for A row_major / B col_major; trailing 0 is hard-coded
+  }  
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) {  // Overload: __nv_bfloat16 A col_major, B col_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __mma_bf16_m32n8k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);    // 3 = literal this file uses for A col_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) {  // Overload: __nv_bfloat16 A row_major, B row_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __mma_bf16_m32n8k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);    // first 0 = literal this file uses for A row_major / B row_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) {  // Overload: __nv_bfloat16 A col_major, B row_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __mma_bf16_m32n8k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);    // 2 = literal this file uses for A col_major / B row_major; trailing 0 is hard-coded
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+  // 
+  // MMA functions for shape m8n32k16
+  // 
+  // D fp16, C fp16
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {  // Overload: A row_major, B col_major; d __half, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m8n32k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);  // 1 = literal this file uses for A row_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {  // Overload: A col_major, B col_major; d __half, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m8n32k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);  // 3 = literal this file uses for A col_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {  // Overload: A row_major, B row_major; d __half, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m8n32k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);  // first 0 = literal this file uses for A row_major / B row_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {  // Overload: A col_major, B row_major; d __half, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m8n32k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);  // 2 = literal this file uses for A col_major / B row_major; trailing 0 is hard-coded
+  }
+
+  // D fp32, C fp16
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {  // Overload: A row_major, B col_major; d float, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m8n32k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);  // 1 = literal this file uses for A row_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {  // Overload: A col_major, B col_major; d float, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m8n32k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);  // 3 = literal this file uses for A col_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {  // Overload: A row_major, B row_major; d float, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m8n32k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);  // first 0 = literal this file uses for A row_major / B row_major; trailing 0 is hard-coded
+  }
+    
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {  // Overload: A col_major, B row_major; d float, c __half. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m8n32k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);  // 2 = literal this file uses for A col_major / B row_major; trailing 0 is hard-coded
+  }
+
+  // D fp32, C fp32
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {  // Overload: A row_major, B col_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m8n32k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);  // 1 = literal this file uses for A row_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {  // Overload: A col_major, B col_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m8n32k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);  // 3 = literal this file uses for A col_major / B col_major; trailing 0 is hard-coded
+  }
+  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {  // Overload: A row_major, B row_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m8n32k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);  // first 0 = literal this file uses for A row_major / B row_major; trailing 0 is hard-coded
+  }
+    
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {  // Overload: A col_major, B row_major; d float, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m8n32k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);  // 2 = literal this file uses for A col_major / B row_major; trailing 0 is hard-coded
+  }
+
+  // D fp16, C fp32
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {  // Overload: A row_major, B col_major; d __half, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m8n32k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);  // 1 = literal this file uses for A row_major / B col_major; trailing 0 is hard-coded
+  }
+    
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {  // Overload: A col_major, B col_major; d __half, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m8n32k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);  // 3 = literal this file uses for A col_major / B col_major; trailing 0 is hard-coded
+  }
+    
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {  // Overload: A row_major, B row_major; d __half, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m8n32k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);  // first 0 = literal this file uses for A row_major / B row_major; trailing 0 is hard-coded
+  }
+    
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {  // Overload: A col_major, B row_major; d __half, c float. Forwards the four fragments (cast to raw pointers) to the intrinsic.
+    __hmma_m8n32k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);  // 2 = literal this file uses for A col_major / B row_major; trailing 0 is hard-coded
+  }
+
+#ifdef __CUDA_IMMA__  
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {  // Overload: signed char A row_major, B col_major; int accumulators. satf selects the intrinsic's last argument.
+    if (satf)  // NOTE(review): separate calls with literal 1/0, presumably because the argument must be a constant - confirm
+      __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 1);  // layout literal 1 (A row_major / B col_major), satf literal 1
+    else
+      __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 0);  // layout literal 1, satf literal 0
+  }
+  
+  // m8n32k16 integer mma_sync overload: signed char A (col_major) and B (col_major), int accumulators C and D.
+  // Calls __imma_m8n32k16_mma_s8 with layout code 3; satf picks the final argument (1 when true, 0 when false).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
+    if (satf)
+      __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, 1);
+    else
+      __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, 0);
+  }
+  
+  // m8n32k16 integer mma_sync overload: signed char A (row_major) and B (row_major), int accumulators C and D.
+  // Calls __imma_m8n32k16_mma_s8 with layout code 0; satf picks the final argument (1 when true, 0 when false).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
+    if (satf)
+      __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, 1);
+    else
+      __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, 0);
+  }
+  
+  // m8n32k16 integer mma_sync overload: signed char A (col_major) and B (row_major), int accumulators C and D.
+  // Calls __imma_m8n32k16_mma_s8 with layout code 2; satf picks the final argument (1 when true, 0 when false).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
+    if (satf)
+      __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, 1);
+    else
+      __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, 0);
+  }
+  
+  // m8n32k16 integer mma_sync overload: unsigned char A (row_major) and B (col_major), int accumulators C and D.
+  // Calls __imma_m8n32k16_mma_u8 with layout code 1; satf picks the final argument (1 when true, 0 when false).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
+    if (satf)
+      __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 1);
+    else
+      __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 0);
+  }
+  
+  // m8n32k16 integer mma_sync overload: unsigned char A (col_major) and B (col_major), int accumulators C and D.
+  // Calls __imma_m8n32k16_mma_u8 with layout code 3; satf picks the final argument (1 when true, 0 when false).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
+    if (satf)
+      __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, 1);
+    else
+      __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, 0);
+  }
+  
+  // m8n32k16 integer mma_sync overload: unsigned char A (row_major) and B (row_major), int accumulators C and D.
+  // Calls __imma_m8n32k16_mma_u8 with layout code 0; satf picks the final argument (1 when true, 0 when false).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
+    if (satf)
+      __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, 1);
+    else
+      __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, 0);
+  }
+  
+  // m8n32k16 integer mma_sync overload: unsigned char A (col_major) and B (row_major), int accumulators C and D.
+  // Calls __imma_m8n32k16_mma_u8 with layout code 2; satf picks the final argument (1 when true, 0 when false).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
+    if (satf)
+      __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, 1);
+    else
+      __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, 0);
+  }
+#endif  /* __CUDA_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  // m8n32k16 mma_sync overload (only under __CUDA_AMPERE_MMA__): __nv_bfloat16 A (row_major) and B (col_major),
+  // float accumulators C and D. Forwards to __mma_bf16_m8n32k16_mma_f32 with layout code 1.
+  // NOTE(review): the trailing 0 is presumably the saturation flag (off) -- confirm against the intrinsic.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) {
+    __mma_bf16_m8n32k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
+  }
+  
+  // m8n32k16 mma_sync overload: __nv_bfloat16 A (col_major) and B (col_major), float accumulators C and D.
+  // Forwards to __mma_bf16_m8n32k16_mma_f32 with layout code 3.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) {
+    __mma_bf16_m8n32k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
+  }
+  
+  // m8n32k16 mma_sync overload: __nv_bfloat16 A (row_major) and B (row_major), float accumulators C and D.
+  // Forwards to __mma_bf16_m8n32k16_mma_f32 with layout code 0.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) {
+    __mma_bf16_m8n32k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);    
+  }
+  
+  // m8n32k16 mma_sync overload: __nv_bfloat16 A (col_major) and B (row_major), float accumulators C and D.
+  // Forwards to __mma_bf16_m8n32k16_mma_f32 with layout code 2.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) {
+    __mma_bf16_m8n32k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);        
+  }
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+
+#ifdef __CUDA_SUBBYTE_IMMA__  
+  // 
+  // MMA functions for shape m8n8k32
+  // 
+  // m8n8k32 sub-byte mma_sync overload: experimental::precision::s4 A (row_major) and B (col_major),
+  // int accumulators C and D. Only the row/col layout combination is provided for this shape here.
+  // Calls __imma_m8n8k32_mma_s4 with layout code 1; satf picks the final argument (1 when true, 0 when false).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf) {
+    if (satf)
+      __imma_m8n8k32_mma_s4((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 1);
+    else
+      __imma_m8n8k32_mma_s4((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 0);
+  }
+
+  // m8n8k32 sub-byte mma_sync overload: experimental::precision::u4 A (row_major) and B (col_major),
+  // int accumulators C and D.
+  // Calls __imma_m8n8k32_mma_u4 with layout code 1; satf picks the final argument (1 when true, 0 when false).
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf) {
+    if (satf)
+      __imma_m8n8k32_mma_u4((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 1);
+    else
+      __imma_m8n8k32_mma_u4((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 0);
+  }
+
+  // 
+  // MMA functions for shape m8n8k128
+  // 
+  // m8n8k128 single-bit bmma_sync: experimental::precision::b1 A (row_major) and B (col_major), int accumulators.
+  // When __CUDA_AMPERE_MMA__ is defined, op == experimental::bmmaBitOpAND selects the and_popc intrinsic;
+  // every other case (and every case when the macro is not defined) uses the xor_popc intrinsic.
+  // The experimental::bmmaAccumulateOp parameter is unnamed and not read by this body.
+  // Both intrinsics receive layout code 1 (A row-major, B col-major).
+  __CUDA_MMA_DEVICE_DECL__  void bmma_sync(fragment<accumulator, 8, 8, 128, int>& d, const fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& b, const fragment<accumulator, 8, 8, 128, int>& c,
+                                           experimental::bmmaBitOp op, experimental::bmmaAccumulateOp)
+  {
+     
+#ifdef __CUDA_AMPERE_MMA__
+    if (op == experimental::bmmaBitOpAND) 
+      __bmma_m8n8k128_mma_and_popc_b1((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1);
+    else 
+#endif  /* __CUDA_AMPERE_MMA__ */      
+      // Without the Ampere branch above, this statement is unconditional.
+      __bmma_m8n8k128_mma_xor_popc_b1((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1);
+  }
+
+
+#endif  /* __CUDA_SUBBYTE_IMMA__ */
+
+#ifdef __CUDA_AMPERE_MMA__
+  // 
+  // MMA functions for shape m16n16k8
+  // 
+  // m16n16k8 mma_sync overload: precision::tf32 A (row_major) and B (col_major), float accumulators C and D.
+  // Forwards to __mma_tf32_m16n16k8_mma_f32 with layout code 1.
+  // NOTE(review): the trailing 0 is presumably the saturation flag (off) -- confirm against the intrinsic.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) {
+    __mma_tf32_m16n16k8_mma_f32((float *)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);    
+  }
+
+  // m16n16k8 mma_sync overload: precision::tf32 A (col_major) and B (col_major), float accumulators C and D.
+  // Forwards to __mma_tf32_m16n16k8_mma_f32 with layout code 3.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) {
+    __mma_tf32_m16n16k8_mma_f32((float *)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);    
+  }
+  
+  // m16n16k8 mma_sync overload: precision::tf32 A (row_major) and B (row_major), float accumulators C and D.
+  // Forwards to __mma_tf32_m16n16k8_mma_f32 with layout code 0.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c)  {
+    __mma_tf32_m16n16k8_mma_f32((float *)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);    
+  }
+
+  // m16n16k8 mma_sync overload: precision::tf32 A (col_major) and B (row_major), float accumulators C and D.
+  // Forwards to __mma_tf32_m16n16k8_mma_f32 with layout code 2.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c)  {
+    __mma_tf32_m16n16k8_mma_f32((float *)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);    
+  }
+
+  
+  // 
+  // MMA functions for shape m8n8k4
+  // 
+  // m8n8k4 mma_sync overload: double A (row_major) and B (col_major), double accumulators C and D.
+  // Forwards to __dmma_m8n8k4_mma_f64 with layout code 1; all four fragments are passed as double pointers.
+  // NOTE(review): the trailing 0 is presumably a rounding/saturation selector -- confirm against the intrinsic.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) {
+    __dmma_m8n8k4_mma_f64((double *)&d, (const double*)&a, (const double*)&b, (const double*)&c, 1, 0);
+  }
+  
+  // m8n8k4 mma_sync overload: double A (col_major) and B (col_major), double accumulators C and D.
+  // Forwards to __dmma_m8n8k4_mma_f64 with layout code 3.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) {
+    __dmma_m8n8k4_mma_f64((double *)&d, (const double*)&a, (const double*)&b, (const double*)&c, 3, 0);
+  }
+  
+  // m8n8k4 mma_sync overload: double A (row_major) and B (row_major), double accumulators C and D.
+  // Forwards to __dmma_m8n8k4_mma_f64 with layout code 0.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) {
+    __dmma_m8n8k4_mma_f64((double *)&d, (const double*)&a, (const double*)&b, (const double*)&c, 0, 0);
+  }
+  
+  // m8n8k4 mma_sync overload: double A (col_major) and B (row_major), double accumulators C and D.
+  // Forwards to __dmma_m8n8k4_mma_f64 with layout code 2.
+  __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) {
+    __dmma_m8n8k4_mma_f64((double *)&d, (const double*)&a, (const double*)&b, (const double*)&c, 2, 0);
+  }
+  
+#endif  /* __CUDA_AMPERE_MMA__ */
+
+};
+};
+
+#undef __CUDA_IMMA__
+#undef __CUDA_SUBBYTE_IMMA__
+#undef __CUDA_MMA_DEVICE_DECL__
+#undef __CUDA_AMPERE_MMA__
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+
+#endif   /* __CUDA_MMA_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_HPP__
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/nvfunctional b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/nvfunctional
new file mode 100644
index 0000000000000000000000000000000000000000..5cb9ffeb9cb9f1d202cb1f5cb1d4d7e88a416475
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/nvfunctional
@@ -0,0 +1,621 @@
+/*
+ * NVIDIA_COPYRIGHT_BEGIN
+ *
+ * Copyright (c) 2014-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ * NVIDIA_COPYRIGHT_END
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/nvfunctional is an internal header file and must not be used directly.  Please use nvfunctional instead.")
+#else
+#warning "crt/nvfunctional is an internal header file and must not be used directly.  Please use nvfunctional instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__
+#endif
+
+#ifndef __NV_LIBCXX_FUNCTIONAL_H__
+#define __NV_LIBCXX_FUNCTIONAL_H__
+
+#if __cplusplus < 201103L 
+  #if defined(_MSC_VER)
+    #if _MSC_VER < 1800
+      #error This library requires VS 2013 and above
+    #endif /* _MSC_VER < 1800 */
+  #else /* !_MSC_VER */
+    #error This library requires support for the ISO C++ 2011 standard
+  #endif /* _MSC_VER */
+#endif /* __cplusplus */
+
+#if defined(_MSC_VER)
+  #define __NV_ALIGNOF __alignof
+  #define __NV_NOEXCEPT
+  #define __NV_CONSTEXPR
+#else /* !_MSC_VER */
+  #define __NV_ALIGNOF alignof
+  #define __NV_NOEXCEPT noexcept
+  #define __NV_CONSTEXPR constexpr
+#endif /* _MSC_VER */
+
+#include <type_traits>
+#include <cstddef>
+#include <new>
+
+// n3290 20.8
+namespace nvstd
+{
+
+namespace internal {
+
+// D.8.1 base (deprecated) [depr.base]
+// Base type that publishes the argument and result type names of a
+// one-argument callable (mirrors the deprecated std::unary_function).
+template <class _Arg, class _Result>
+struct unary_function
+{
+  using argument_type = _Arg;
+  using result_type = _Result;
+};
+
+// Base type that publishes the two argument type names and the result type
+// name of a two-argument callable (mirrors the deprecated std::binary_function).
+template <class _Arg1, class _Arg2, class _Result>
+struct binary_function
+{
+  using first_argument_type = _Arg1;
+  using second_argument_type = _Arg2;
+  using result_type = _Result;
+};
+
+// move
+// Casts __t to an rvalue reference of its referenced type so that it can be
+// moved from; usable from both host and device code.
+template <class _T>
+inline __device__ __host__
+typename std::remove_reference<_T>::type&& move(_T&& __t) __NV_NOEXCEPT
+{
+  typedef typename std::remove_reference<_T>::type _Bare;
+  return static_cast<_Bare&&>(__t);
+}
+
+// 20.2.2 swap [utility.swap]
+// swap
+// Exchanges __a and __b through three moves. Only participates in overload
+// resolution when _T is move-constructible and move-assignable; outside MSVC
+// it is noexcept exactly when both of those moves are.
+template<class _T, 
+         class = typename std::enable_if<
+                   std::is_move_constructible<_T>::value &&
+                   std::is_move_assignable<_T>::value>::type>
+inline __device__ __host__
+void swap(_T& __a, _T& __b) 
+#if !defined(_MSC_VER)
+noexcept(std::is_nothrow_move_constructible<_T>::value &&
+         std::is_nothrow_move_assignable<_T>::value)
+#endif /* !defined(_MSC_VER) */
+{
+  // Park the first value, then shuffle the second into place.
+  _T __parked(internal::move(__a));
+  __a = internal::move(__b);
+  __b = internal::move(__parked);
+}
+
+// 20.2.3 forward/move helpers [forward]
+// forward
+// Lvalue overload of forward: returns __t cast to _T&&, which collapses to an
+// lvalue reference when _T is an lvalue reference type and to an rvalue
+// reference otherwise.
+template <class _T> 
+inline __device__ __host__
+_T&& forward(typename std::remove_reference<_T>::type& __t) __NV_NOEXCEPT
+{
+  typedef _T&& _Forwarded;
+  return static_cast<_Forwarded>(__t);
+}
+
+// Rvalue overload of forward: returns __t cast to _T&&. Instantiating it with
+// an lvalue reference type is rejected at compile time.
+template <class _T> 
+inline __device__ __host__
+_T&& forward(typename std::remove_reference<_T>::type&& __t) __NV_NOEXCEPT
+{
+  typedef _T&& _Forwarded;
+  static_assert(!std::is_lvalue_reference<_T>::value,
+                "Error: __t is instantiated with an lvalue reference type");
+  return static_cast<_Forwarded>(__t);
+}
+
+} // namespace internal
+
+namespace __functional_helpers
+{
+
+struct __dummy_class;
+
+// Store small functors locally:
+// a functor is eligible for local storage if it is one of the following types:
+// * member object pointer;
+// * member function pointer;
+// * closure type of size less than or equal to the largest size of 
+//   the above types;
+// * function pointer;
+// * any callable class whose size is less than or equal to
+//   the largest one of the above types;
+// Union of the pointer kinds that may be stored in place: an object pointer,
+// a function pointer and a pointer to member function. It is used only for
+// its size (see _Small_functor_data) and its alignment (see
+// function::__use_small_functor_data).
+union _Small_functor_types 
+{
+  void *__obj;                          // object pointer
+  void (*__func_ptr)();                 // plain function pointer
+  void (__dummy_class::*mem_fn_ptr)();  // pointer to member function
+};
+
+// Raw byte buffer with the size of _Small_functor_types; nvstd::function
+// placement-constructs small functors into it.
+struct _Small_functor_data {
+  char __data[sizeof(_Small_functor_types)];
+};
+
+// Optional base class of nvstd::function. The primary template is empty, so
+// signatures other than the two specialized below contribute no typedefs.
+template <class _RetType, class ..._ArgTypes>
+struct __maybe_base_function
+{ };
+
+// One-argument signature: inherit argument_type/result_type from unary_function.
+template <class _RetType, class _T1>
+struct __maybe_base_function<_RetType(_T1)>
+  : public internal::unary_function<_T1, _RetType>
+{ };
+
+// Two-argument signature: inherit first_argument_type/second_argument_type/
+// result_type from binary_function.
+template <class _RetType, class _T1, class _T2>
+struct __maybe_base_function<_RetType(_T1, _T2)>
+  : public internal::binary_function<_T1, _T2, _RetType>
+{ };
+
+} // namespace __functional_helpers
+
+// 20.8.11 Polymorphic function wrappers [func.wrap]
+
+// 20.8.11.1 Class bad_function_call [func.wrap.badcall]
+// unimplemented because of exception
+// class bad_function_call : public std::exception
+
+// 20.8.11.2 Class template function [func.wrap.func]
+
+template<class> class function; // undefined
+
+// Simplified version of template class function, which
+//   * does not support allocator_arg_t;
+//   * does not support target and target_type that rely on RTTI
+//   * does not throw bad_function_call exception on invoking a NULL target
+template <class _RetType, class ..._ArgTypes>
+class function<_RetType(_ArgTypes...)> 
+  : public __functional_helpers::__maybe_base_function<_RetType(_ArgTypes...)>
+{
+  __functional_helpers::_Small_functor_data __small_functor_data;
+  void *__obj;
+  typedef _RetType(*__meta_fn_type)(void *, _ArgTypes...);
+  __meta_fn_type __meta_fn;
+  typedef void(*__cloner_type)(function &, const function &);
+  __cloner_type __cloner;
+  typedef void(*__destructor_type)(function *);
+  __destructor_type __destructor;
+
+  // Reports whether a functor of type _F is stored in the in-object buffer:
+  // true when _F is neither larger than the buffer nor more strictly aligned
+  // than _Small_functor_types.
+  #pragma nv_exec_check_disable
+  template <class _F>
+  __device__ __host__
+  __NV_CONSTEXPR bool __use_small_functor_data() const
+  {
+    return !(sizeof(_F) > sizeof(__small_functor_data) ||
+             __NV_ALIGNOF(_F) > __NV_ALIGNOF(
+                                  __functional_helpers::_Small_functor_types));
+  }
+
+  // Returns the address of the first byte of the in-object buffer as a
+  // non-const void* (constness is cast away even though the method is const).
+  #pragma nv_exec_check_disable
+  __device__ __host__
+  void* __get_small_functor_data() const
+  {
+    return (void*)(__small_functor_data.__data);
+  }
+
+  // True when __obj currently points at this object's own in-object buffer.
+  #pragma nv_exec_check_disable
+  __device__ __host__
+  bool __is_small_functor_data() const
+  {
+    void *__local_buffer = __get_small_functor_data();
+    return __local_buffer == __obj;
+  }
+
+  // Reinterprets the untyped storage pointer __p as a functor of type _F and
+  // returns a reference to it. No check is made that __p really holds an _F.
+  #pragma nv_exec_check_disable
+  template <class _F>
+  __device__ __host__
+  static _F& __get_functor(void *__p)
+  {
+    _F *__typed = (_F*)__p;
+    return *__typed;
+  }
+
+  // __is_empty_functor overload set: tells the converting constructor whether
+  // the callable it was given is "null" and the function should stay empty.
+
+  // Generic callables are never considered empty.
+  #pragma nv_exec_check_disable
+  template <class _F>
+  __device__ __host__
+  static bool __is_empty_functor(const _F& /*__p*/)
+  {
+    return false;
+  }
+
+  // Pointers are empty when null.
+  #pragma nv_exec_check_disable
+  template <class _F>
+  __device__ __host__
+  static bool __is_empty_functor(const _F* __p)
+  {
+    return __p ? false : true;
+  }
+  
+  // Pointers to members are empty when null.
+  #pragma nv_exec_check_disable
+  template <class _Res, class _C>
+  __device__ __host__
+  static bool __is_empty_functor(const _Res _C::* __p)
+  {
+    return __p ? false : true;
+  }
+ 
+  // Another nvstd::function is empty when its bool conversion yields false.
+  #pragma nv_exec_check_disable
+  template <class _Res, class... _Args>
+  __device__ __host__
+  static bool __is_empty_functor(const function<_Res(_Args...)>& __p)
+  {
+    return __p ? false : true;
+  }
+  
+  // Type-erased copy helper for a target of type _F; its __clone_data is what
+  // gets stored in __cloner.
+  template <class _F>
+  struct __make_cloner
+  {
+    // Copy-constructs __src's target into __dest and points __dest.__obj at
+    // the copy: in __dest's in-object buffer when _F qualifies for small
+    // storage, otherwise in a fresh heap allocation. Only __dest.__obj is
+    // written; the other members of __dest are left untouched.
+    #pragma nv_exec_check_disable
+    __device__ __host__
+    static void __clone_data(function &__dest, const function &__src)
+    {
+      _F &__original = __src.__get_functor<_F>(__src.__obj);
+      if (!__dest.__use_small_functor_data<_F>()) {
+        __dest.__obj = new _F(__original);
+        return;
+      }
+      __dest.__obj = __dest.__get_small_functor_data();
+      new (__dest.__obj) _F(__original);
+    }
+  };
+
+  // Type-erased destruction helper for a target of type _F; its __destruct is
+  // what gets stored in __destructor.
+  template <class _F>
+  struct __make_destructor
+  {
+    // Destroys the target held by *__fn: an explicit destructor call when _F
+    // lives in the in-object buffer, a delete when it lives on the heap.
+    // __fn->__obj itself is not modified here.
+    #pragma nv_exec_check_disable
+    __device__ __host__
+    static void __destruct(function *__fn)
+    {
+      _F *__target = (_F*)(__fn->__obj);
+      if (!__fn->__use_small_functor_data<_F>()) {
+        delete __target;
+        return;
+      }
+      __target->~_F();
+    }
+  };
+
+  // We cannot simply define __make_functor in the following way:
+  // template <class _T, _F>
+  // __make_functor;
+  // template <class _RetType1, class _F, class... _ArgTypes1>
+  // struct __make_functor<_RetType1(_ArgTypes1...), _F> 
+  //
+  // because VS 2013 cannot unpack _RetType1(_ArgTypes1...)
+  // Type-erased call helper for a target of type _F; its __invoke is what
+  // gets stored in __meta_fn.
+  template <class _RetType1, class _F, class... _ArgTypes1>
+  struct __make_functor
+  {
+    typedef _RetType1 type;
+
+    // Treats __d as a pointer to an _F and calls it with the forwarded
+    // arguments, returning whatever the call returns.
+    #pragma nv_exec_check_disable
+    __device__ __host__
+    static _RetType1 __invoke(void *__d, _ArgTypes1... __args)
+    {
+      _F &__callee = __get_functor<_F>(__d);
+      return __callee(internal::forward<_ArgTypes1>(__args)...);
+    }
+  };
+
+  // Partial specialization of __make_functor selected when the stored target
+  // type is a pointer to member (_M _C::*).
+  // NOTE(review): __invoke reads the storage as the plain function pointer
+  // type _Fn and calls it with all arguments; it does not use pointer-to-member
+  // call syntax. Confirm whether member-pointer targets are actually supported.
+  template <class _RetType1, class _C, class _M, class... _ArgTypes1>
+  struct __make_functor<_RetType1, _M _C::*,_ArgTypes1...>
+  {
+    typedef _RetType1 type;
+    // Function pointer type with the wrapper's own call signature.
+    typedef _RetType1(*_Fn)(_ArgTypes1...);
+
+    #pragma nv_exec_check_disable    
+    __device__ __host__
+    static _RetType1 __invoke(void *__d, _ArgTypes1... __args)
+    {
+      return __get_functor<_Fn>(__d)(
+               internal::forward<_ArgTypes1>(__args)...);
+    }
+  };
+
+// __check_callability<_F>::value feeds the static_asserts that guard
+// construction and assignment from a callable. Three variants are selected by
+// the preprocessor.
+// workaround for GCC version below 4.8
+#if (__GNUC__ == 4) && (__GNUC_MINOR__ < 8)
+  // GCC < 4.8: only rejects std::nullptr_t; no signature check is performed.
+  template <class _F>
+  struct __check_callability
+    : public std::integral_constant<bool, 
+                                    !std::is_same<_F, std::nullptr_t>::value>
+  { };
+#elif defined(_MSC_VER)
+  // simulate VC 2013's behavior...
+  // Helper: true for member pointers, otherwise true when _RetType is
+  // convertible to the result of calling _F with _ArgTypes... (note that this
+  // is the opposite conversion direction from the generic variant below).
+  template <class _F>
+  struct __check_callability1
+    : public 
+        std::integral_constant<bool, 
+          // std::result_of does not handle member pointers well 
+          std::is_member_pointer<_F>::value ||
+          std::is_convertible<
+            _RetType,
+            typename std::result_of<_F(_ArgTypes...)>::type
+          >::value
+        >
+  { };
+
+  // MSVC: _F must not be this function type itself and its cv-stripped form
+  // must pass __check_callability1.
+  template <class _F>
+  struct __check_callability
+    : public std::integral_constant<
+               bool,
+               !std::is_same<_F, function>::value && 
+               __check_callability1<typename std::remove_cv<_F>::type>::value>
+  { };
+#else /* !((__GNUC__ == 4) && (__GNUC_MINOR__ < 8)) _MSC_VER */
+  // Generic variant: _F must not be this function type itself and the result
+  // of calling _F with _ArgTypes... must be convertible to _RetType.
+  template <class _F,
+            class _T = typename std::result_of<_F(_ArgTypes...)>::type>
+  struct __check_callability
+    : public std::integral_constant<
+               bool,
+               !std::is_same<_F, function>::value && 
+                 std::is_convertible< _T, _RetType>::value>
+  { };
+#endif /* (__GNUC__ == 4) && (__GNUC_MINOR__ < 8) */
+
+  // Destroys the current target, if any, and marks the function as empty by
+  // zeroing __obj. __meta_fn, __cloner and __destructor keep their old values.
+  #pragma nv_exec_check_disable
+  __device__ __host__
+  void __destroy()
+  {
+    if (!__obj) {
+      return;  // already empty, nothing to release
+    }
+    __destructor(this);
+    __obj = 0;
+  }
+  
+  // Resets all four bookkeeping pointers to null without destroying anything;
+  // the in-object buffer is not touched.
+  #pragma nv_exec_check_disable 
+  __device__ __host__
+  void __clear()
+  {
+    __destructor = nullptr;
+    __cloner = nullptr;
+    __meta_fn = nullptr;
+    __obj = nullptr;
+  }
+
+public:
+  typedef _RetType result_type;
+
+/* 
+ * These typedef(s) are derived from __maybe_base_function
+ * typedef T1 argument_type;        // only if sizeof...(ArgTypes) == 1 and
+ *                                  // the type in ArgTypes is T1
+ * typedef T1 first_argument_type;  // only if sizeof...(ArgTypes) == 2 and
+ *                                  // ArgTypes contains T1 and T2
+ * typedef T2 second_argument_type; // only if sizeof...(ArgTypes) == 2 and
+ *                                  // ArgTypes contains T1 and T2
+ */
+
+  // 20.8.11.2.1 construct/copy/destroy [func.wrap.con]
+  
+  // Default constructor: creates an empty function (all bookkeeping pointers
+  // null).
+  #pragma nv_exec_check_disable 
+  __device__ __host__ 
+  function() __NV_NOEXCEPT
+    : __obj(nullptr),
+      __meta_fn(nullptr),
+      __cloner(nullptr),
+      __destructor(nullptr)
+  {}
+
+  // Constructs an empty function from nullptr (all bookkeeping pointers null).
+  #pragma nv_exec_check_disable 
+  __device__ __host__ 
+  function(std::nullptr_t) __NV_NOEXCEPT
+    : __obj(nullptr),
+      __meta_fn(nullptr),
+      __cloner(nullptr),
+      __destructor(nullptr)
+  {}
+
+  // Copy constructor: an empty source yields an empty function; otherwise the
+  // type-erased helpers are copied and the source's cloner duplicates its
+  // target into this object.
+  #pragma nv_exec_check_disable 
+  __device__ __host__ 
+  function(const function &__fn)
+  {
+    if (__fn.__obj != 0) {
+      __meta_fn = __fn.__meta_fn;
+      __destructor = __fn.__destructor;
+      // The cloner sets __obj (in-object buffer or heap copy).
+      __fn.__cloner(*this, __fn);
+      __cloner = __fn.__cloner;
+    }
+    else {
+      __clear();
+    }
+  }
+
+  // Move constructor: starts from the empty state and then swaps with __fn,
+  // so this object takes over __fn's target and __fn is left empty.
+  // The members must be zeroed before the swap: swap() reads this object's
+  // __obj (through __is_small_functor_data()) and hands this object's old
+  // member values to __fn. Without the initializers those values are
+  // indeterminate, and __fn's destructor would call an indeterminate
+  // __destructor pointer whenever the garbage __obj happened to be non-null.
+  #pragma nv_exec_check_disable 
+  __device__ __host__ 
+  function(function &&__fn)
+    : __obj(0), __meta_fn(0), __cloner(0), __destructor(0)
+  {
+    __fn.swap(*this);
+  }
+
+  // VS 2013 cannot process __check_callability type trait.
+  // So, we check callability using static_assert instead of
+  // using SFINAE such as
+  // template<class _F, 
+  //          class = typename std::enable_if<
+  //                    __check_callability<_F>::value
+  //         >::type>
+  
+  #pragma nv_exec_check_disable   
+  template<class _F>
+  __device__ __host__ 
+  function(_F);
+
+  // copy and swap
+  // Copy assignment: builds a copy of __fn and swaps it into *this; the
+  // previous target is destroyed together with the local copy.
+  #pragma nv_exec_check_disable   
+  __device__ __host__
+  function& operator=(const function& __fn)
+  {
+    function __duplicate(__fn);
+    __duplicate.swap(*this);
+    return *this;
+  }
+
+  // Move assignment: move-constructs a local from __fn and swaps it into
+  // *this; the previous target is destroyed together with the local.
+  #pragma nv_exec_check_disable 
+  __device__ __host__
+  function& operator=(function&& __fn)
+  {
+    function __taken(internal::move(__fn));
+    __taken.swap(*this);
+    return *this;
+  }
+
+  // Assignment from nullptr: destroys the current target (if any) via
+  // __destroy(), leaving the function empty, and returns *this.
+  #pragma nv_exec_check_disable 
+  __device__ __host__
+  function& operator=(std::nullptr_t)
+  {
+    __destroy();
+    return *this;
+  }
+
+  // Assignment from an arbitrary callable: statically checks callability,
+  // wraps the forwarded callable in a new function and swaps it into *this.
+  #pragma nv_exec_check_disable
+  template<class _F>
+  __device__ __host__
+  function&
+  operator=(_F&& __fn) 
+  {
+    static_assert(__check_callability<_F>::value,
+                  "Unable to create functor object!");
+    function __wrapped(internal::forward<_F>(__fn));
+    __wrapped.swap(*this);
+    return *this;
+  }
+
+  // Destructor: releases the stored target, if any, through __destroy().
+  #pragma nv_exec_check_disable
+  __device__ __host__
+  ~function()
+  {
+    __destroy();
+  }
+
+  // 20.8.11.2.2 function modifiers [func.wrap.func.mod]
+  // Exchanges the targets of *this and __fn. The three helper pointers are
+  // always swapped. The in-object buffers are swapped whenever at least one
+  // side stores its target there, and each __obj is then fixed up so that a
+  // small target always points at the buffer of the object that now owns it.
+  #pragma nv_exec_check_disable 
+  __device__ __host__
+  void swap(function& __fn) __NV_NOEXCEPT
+  {
+    // Storage kinds are sampled up front; nothing below changes __obj before
+    // they are used.
+    const bool __self_small = __is_small_functor_data();
+    const bool __other_small = __fn.__is_small_functor_data();
+
+    internal::swap(__meta_fn, __fn.__meta_fn);
+    internal::swap(__cloner, __fn.__cloner);
+    internal::swap(__destructor, __fn.__destructor);
+
+    if (__self_small || __other_small) {
+      internal::swap(__small_functor_data, __fn.__small_functor_data);
+    }
+    if (__self_small && __other_small) {
+      // Each __obj already points at its own buffer.
+      return;
+    }
+
+    internal::swap(__obj, __fn.__obj);
+    if (__self_small) {
+      // The small target moved into __fn's buffer.
+      __fn.__obj = __fn.__get_small_functor_data();
+    }
+    else if (__other_small) {
+      // The small target moved into this object's buffer.
+      __obj = __get_small_functor_data();
+    }
+  }
+
+  // 20.8.11.2.3 function capacity [func.wrap.func.cap]
+  // True when a target is stored, i.e. when __obj is non-null.
+  #pragma nv_exec_check_disable   
+  __device__ __host__
+  explicit operator bool() const __NV_NOEXCEPT
+  {
+    return __obj != 0;
+  }
+
+  // 20.8.11.2.4 function invocation [func.wrap.func.inv]
+  // function::operator() can only be called in device code
+  // to avoid cross-execution space calls
+  #pragma nv_exec_check_disable   
+  __device__ __host__
+  _RetType operator()(_ArgTypes...) const;
+
+};
+
+// Out-of-line definitions
+// Converting constructor: wraps an arbitrary callable __fn. The function
+// starts empty and stays empty when __is_empty_functor(__fn) reports a null
+// callable; otherwise the type-erased helpers for _F are installed and __fn
+// is moved into the in-object buffer (small functors) or onto the heap.
+#pragma nv_exec_check_disable
+template<class _RetType, class... _ArgTypes>
+template<class _F>
+__device__ __host__
+function<_RetType(_ArgTypes...)>::function(_F __fn)
+  : __obj(0), __meta_fn(0), __cloner(0), __destructor(0)
+{
+  static_assert(__check_callability<_F>::value,
+                "Unable to construct functor object!");
+  if (!__is_empty_functor(__fn)) {
+    __meta_fn = &__make_functor<_RetType, _F, _ArgTypes...>::__invoke;
+    __cloner = &__make_cloner<_F>::__clone_data;
+    __destructor = &__make_destructor<_F>::__destruct;
+
+    if (!__use_small_functor_data<_F>()) {
+      __obj = new _F(internal::move(__fn));
+    }
+    else {
+      __obj = __get_small_functor_data();
+      new ((void*)__obj) _F(internal::move(__fn));
+    }
+  }
+}
+
+// Invokes the stored target by calling __meta_fn with the storage pointer
+// and the forwarded arguments. There is no emptiness check here: on an empty
+// function __meta_fn is null and is called anyway.
+#pragma nv_exec_check_disable 
+template <class _RetType, class..._ArgTypes>
+__device__ __host__
+_RetType
+function<_RetType(_ArgTypes...)>::operator()(_ArgTypes... __args) const
+{
+  return __meta_fn(__obj, internal::forward<_ArgTypes>(__args)...);
+}
+
+// 20.8.11.2.6, Null pointer comparisons:
+
+// function == nullptr: true when __fn holds no target.
+#pragma nv_exec_check_disable 
+template <class _R, class... _ArgTypes>
+__device__ __host__
+bool operator==(const function<_R(_ArgTypes...)>& __fn, std::nullptr_t) 
+__NV_NOEXCEPT
+{
+  return __fn ? false : true;
+}
+
+// nullptr == function: true when __fn holds no target.
+#pragma nv_exec_check_disable 
+template <class _R, class... _ArgTypes>
+__device__ __host__
+bool operator==(std::nullptr_t, const function<_R(_ArgTypes...)>& __fn)
+__NV_NOEXCEPT
+{
+  return __fn ? false : true;
+}
+
+// function != nullptr: true when __fn holds a target.
+#pragma nv_exec_check_disable 
+template <class _R, class... _ArgTypes>
+__device__ __host__
+bool operator!=(const function<_R(_ArgTypes...)>& __fn, std::nullptr_t)
+__NV_NOEXCEPT
+{
+  return __fn ? true : false;
+}
+
+// nullptr != function: true when __fn holds a target.
+#pragma nv_exec_check_disable 
+template <class _R, class... _ArgTypes>
+__device__ __host__
+bool operator!=(std::nullptr_t, const function<_R(_ArgTypes...)>& __fn)
+__NV_NOEXCEPT
+{
+  return __fn ? true : false;
+}
+
+// 20.8.11.2.7, specialized algorithms:
+// Non-member swap for nvstd::function: delegates to the member swap, so the
+// two functions exchange their targets.
+#pragma nv_exec_check_disable 
+template <class _R, class... _ArgTypes>
+__device__ __host__
+void swap(function<_R(_ArgTypes...)>& __fn1, function<_R(_ArgTypes...)>& __fn2)
+{
+  __fn1.swap(__fn2);
+}
+
+} // namespace nvstd
+
+#undef __NV_NOEXCEPT
+#undef __NV_CONSTEXPR
+#undef __NV_ALIGNOF
+
+#endif // __NV_LIBCXX_FUNCTIONAL_H__
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.h
new file mode 100644
index 0000000000000000000000000000000000000000..6046953afa8c5f71cf7058436de10397d6353e9e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright 2017-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+ //NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_70_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_70_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__
+#endif
+
+#if !defined(__SM_70_RT_H__)
+#define __SM_70_RT_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_70_RT_DECL__ __host__ __device__
+#elif defined(_NVHPC_CUDA)
+#define __SM_70_RT_DECL__ extern __device__ __cudart_builtin__
+#else /* !__CUDACC_RTC__ && !_NVHPC_CUDA */
+#define __SM_70_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ */
+
+
+/******************************************************************************
+ * match: one overload per value type, each terminated by __DEF_IF_HOST      *
+ ******************************************************************************/
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, int value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long long value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long long value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, float value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, double value) __DEF_IF_HOST
+// __match_all_sync: the same eight value types, each with an extra int *pred parameter.
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, int value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long long value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long long value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, float value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, double value, int *pred) __DEF_IF_HOST
+
+__SM_70_RT_DECL__ void __nanosleep(unsigned int ns) __DEF_IF_HOST  // body is in sm_70_rt.hpp, included near the end of this header
+
+__SM_70_RT_DECL__ unsigned short int atomicCAS(unsigned short int *address, unsigned short int compare, unsigned short int val) __DEF_IF_HOST  // unsigned short overload; body is in sm_70_rt.hpp
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_70_RT_DECL__
+
+#if (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA)
+#include "sm_70_rt.hpp"
+#endif /* (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA) */
+
+#endif /* !__SM_70_RT_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__
+#endif
+
+
+#undef EXCLUDE_FROM_RTC
\ No newline at end of file
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.hpp b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..322496587325a1387e4280a509455e3ccc7caa1b
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.hpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright 2017-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_70_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_70_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__
+#endif
+
+#if !defined(__SM_70_RT_HPP__)
+#define __SM_70_RT_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_70_RT_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_70_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-7.0 builtin functions which are included as *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+// __match_any_sync overloads: unsigned, int and float go to __match32_any_sync,
+// (unsigned) long long and double go to __match64_any_sync, and (unsigned) long
+// picks one of the two by comparing sizeof(long) with sizeof(long long).
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned int mask, unsigned int value) {
+  return __match32_any_sync(mask, value);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned int mask, int value) {
+  return __match32_any_sync(mask, static_cast<unsigned int>(value));
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned int mask, unsigned long value) {
+  const bool wide_long = sizeof(long) == sizeof(long long);
+  return wide_long ? __match64_any_sync(mask, static_cast<unsigned long long>(value))
+                   : __match32_any_sync(mask, static_cast<unsigned int>(value));
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned int mask, long value) {
+  const bool wide_long = sizeof(long) == sizeof(long long);
+  return wide_long ? __match64_any_sync(mask, static_cast<unsigned long long>(value))
+                   : __match32_any_sync(mask, static_cast<unsigned int>(value));
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned int mask, unsigned long long value) {
+  return __match64_any_sync(mask, value);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned int mask, long long value) {
+  return __match64_any_sync(mask, static_cast<unsigned long long>(value));
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned int mask, float value) {
+  return __match32_any_sync(mask, __float_as_uint(value));
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned int mask, double value) {
+  return __match64_any_sync(mask, static_cast<unsigned long long>(__double_as_longlong(value)));
+}
+
+// __match_all_sync overloads: unsigned, int and float go to __match32_all_sync,
+// (unsigned) long long and double go to __match64_all_sync, and (unsigned) long
+// picks one of the two by comparing sizeof(long) with sizeof(long long).
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned int mask, unsigned int value, int *pred) {
+  return __match32_all_sync(mask, value, pred);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned int mask, int value, int *pred) {
+  return __match32_all_sync(mask, static_cast<unsigned int>(value), pred);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned int mask, unsigned long value, int *pred) {
+  const bool wide_long = sizeof(long) == sizeof(long long);
+  return wide_long ? __match64_all_sync(mask, static_cast<unsigned long long>(value), pred)
+                   : __match32_all_sync(mask, static_cast<unsigned int>(value), pred);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned int mask, long value, int *pred) {
+  const bool wide_long = sizeof(long) == sizeof(long long);
+  return wide_long ? __match64_all_sync(mask, static_cast<unsigned long long>(value), pred)
+                   : __match32_all_sync(mask, static_cast<unsigned int>(value), pred);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned int mask, unsigned long long value, int *pred) {
+  return __match64_all_sync(mask, value, pred);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned int mask, long long value, int *pred) {
+  return __match64_all_sync(mask, static_cast<unsigned long long>(value), pred);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned int mask, float value, int *pred) {
+  return __match32_all_sync(mask, __float_as_uint(value), pred);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned int mask, double value, int *pred) {
+  return __match64_all_sync(mask, static_cast<unsigned long long>(__double_as_longlong(value)), pred);
+}
+
+__SM_70_RT_DECL__ void __nanosleep(unsigned int ns) { // Emits one inline nanosleep.u32 instruction with ns as its only operand.
+    asm volatile("nanosleep.u32 %0;" :: "r"(ns)); // no outputs; ns is supplied through a register input constraint
+}
+
+
+// C-linkage builtin that backs the unsigned short overload of atomicCAS.
+extern "C" __device__ __device_builtin__ unsigned short __usAtomicCAS(unsigned short *, unsigned short, unsigned short);
+
+// unsigned short overload: passes its three arguments straight to __usAtomicCAS.
+__SM_70_RT_DECL__ unsigned short int atomicCAS(unsigned short int *address, unsigned short int compare, unsigned short int val)
+{ return __usAtomicCAS(address, compare, val); }
+
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_70_RT_DECL__
+
+#endif /* !__SM_70_RT_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc18290966875591b6a6efa1f8564eb76e5aa34b
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright 2017-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_80_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_80_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__
+#endif
+
+#if !defined(__SM_80_RT_H__)
+#define __SM_80_RT_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_80_RT_DECL__ __host__ __device__
+#elif defined(_NVHPC_CUDA)
+#define __SM_80_RT_DECL__ extern __device__ __cudart_builtin__
+#else /* !__CUDACC_RTC__ && !_NVHPC_CUDA */
+#define __SM_80_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ */
+
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+/******************************************************************************
+ * reduce: declarations terminated by __DEF_IF_HOST; unsigned add/min/max    *
+ ******************************************************************************/
+__SM_80_RT_DECL__ unsigned __reduce_add_sync(unsigned mask, unsigned value) __DEF_IF_HOST
+__SM_80_RT_DECL__ unsigned __reduce_min_sync(unsigned mask, unsigned value) __DEF_IF_HOST
+__SM_80_RT_DECL__ unsigned __reduce_max_sync(unsigned mask, unsigned value) __DEF_IF_HOST
+// int overloads of add/min/max
+__SM_80_RT_DECL__ int __reduce_add_sync(unsigned mask, int value) __DEF_IF_HOST
+__SM_80_RT_DECL__ int __reduce_min_sync(unsigned mask, int value) __DEF_IF_HOST
+__SM_80_RT_DECL__ int __reduce_max_sync(unsigned mask, int value) __DEF_IF_HOST
+// and/or/xor are declared for unsigned values only
+__SM_80_RT_DECL__ unsigned __reduce_and_sync(unsigned mask, unsigned value) __DEF_IF_HOST
+__SM_80_RT_DECL__ unsigned __reduce_or_sync(unsigned mask, unsigned value) __DEF_IF_HOST
+__SM_80_RT_DECL__ unsigned __reduce_xor_sync(unsigned mask, unsigned value) __DEF_IF_HOST
+
+#undef EXCLUDE_FROM_RTC
+
+
+extern "C" {
+// Every wrapper in this block declares its *_impl counterpart at block scope
+// and then forwards its own arguments to that function unchanged.
+
+// Returns whatever __nv_associate_access_property_impl returns for (ptr, property).
+inline __device__ void *
+__nv_associate_access_property(const void *ptr, unsigned long long property) {
+  extern __device__ void *
+  __nv_associate_access_property_impl(const void *, unsigned long long);
+  return __nv_associate_access_property_impl(ptr, property);
+}
+
+// Forwards (dst, src, src_size) to __nv_memcpy_async_shared_global_4_impl.
+inline __device__ void
+__nv_memcpy_async_shared_global_4(void *dst, const void *src, unsigned src_size) {
+  extern __device__ void
+  __nv_memcpy_async_shared_global_4_impl(void *, const void *, unsigned);
+  __nv_memcpy_async_shared_global_4_impl(dst, src, src_size);
+}
+
+// Forwards (dst, src, src_size) to __nv_memcpy_async_shared_global_8_impl.
+inline __device__ void
+__nv_memcpy_async_shared_global_8(void *dst, const void *src, unsigned src_size) {
+  extern __device__ void
+  __nv_memcpy_async_shared_global_8_impl(void *, const void *, unsigned);
+  __nv_memcpy_async_shared_global_8_impl(dst, src, src_size);
+}
+
+// Forwards (dst, src, src_size) to __nv_memcpy_async_shared_global_16_impl.
+inline __device__ void
+__nv_memcpy_async_shared_global_16(void *dst, const void *src, unsigned src_size) {
+  extern __device__ void
+  __nv_memcpy_async_shared_global_16_impl(void *, const void *, unsigned);
+  __nv_memcpy_async_shared_global_16_impl(dst, src, src_size);
+}
+} // extern "C"
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 800 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_80_RT_DECL__
+
+#if (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA)
+#include "sm_80_rt.hpp"
+#endif /* (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA) */
+
+#endif /* !__SM_80_RT_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.hpp b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..857bd44a3bb0d8480560047a85f9059bc370b52f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.hpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2017-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_80_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_80_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__
+#endif
+
+#if !defined(__SM_80_RT_HPP__)
+#define __SM_80_RT_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_80_RT_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_80_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-8.0 builtin functions which are included as *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+extern "C" { // C-linkage __device_builtin__ functions that the __reduce_*_sync wrappers in this file forward to
+  __device_builtin__ __device__ unsigned __reduce_add_sync_unsigned_impl(unsigned, unsigned);
+  __device_builtin__ __device__ unsigned __reduce_min_sync_unsigned_impl(unsigned, unsigned);
+  __device_builtin__ __device__ unsigned __reduce_max_sync_unsigned_impl(unsigned, unsigned);
+  __device_builtin__ __device__ int __reduce_add_sync_signed_impl(unsigned, int);
+  __device_builtin__ __device__ int __reduce_min_sync_signed_impl(unsigned, int);
+  __device_builtin__ __device__ int __reduce_max_sync_signed_impl(unsigned, int);
+  __device_builtin__ __device__ unsigned __reduce_or_sync_unsigned_impl(unsigned, unsigned);
+  __device_builtin__ __device__ unsigned __reduce_and_sync_unsigned_impl(unsigned, unsigned);
+  __device_builtin__ __device__ unsigned __reduce_xor_sync_unsigned_impl(unsigned, unsigned);
+}
+
+// unsigned overload: hands mask and value to __reduce_add_sync_unsigned_impl.
+__SM_80_RT_DECL__ unsigned int __reduce_add_sync(unsigned int mask, unsigned int value)
+{ return __reduce_add_sync_unsigned_impl(mask, value); }
+
+// unsigned overload: hands mask and value to __reduce_min_sync_unsigned_impl.
+__SM_80_RT_DECL__ unsigned int __reduce_min_sync(unsigned int mask, unsigned int value)
+{ return __reduce_min_sync_unsigned_impl(mask, value); }
+
+// unsigned overload: hands mask and value to __reduce_max_sync_unsigned_impl.
+__SM_80_RT_DECL__ unsigned int __reduce_max_sync(unsigned int mask, unsigned int value)
+{ return __reduce_max_sync_unsigned_impl(mask, value); }
+
+// int overload: hands mask and value to __reduce_add_sync_signed_impl.
+__SM_80_RT_DECL__ int __reduce_add_sync(unsigned int mask, int value)
+{ return __reduce_add_sync_signed_impl(mask, value); }
+  
+// int overload: hands mask and value to __reduce_min_sync_signed_impl.
+__SM_80_RT_DECL__ int __reduce_min_sync(unsigned int mask, int value)
+{ return __reduce_min_sync_signed_impl(mask, value); }
+
+// int overload: hands mask and value to __reduce_max_sync_signed_impl.
+__SM_80_RT_DECL__ int __reduce_max_sync(unsigned int mask, int value)
+{ return __reduce_max_sync_signed_impl(mask, value); }
+
+// unsigned-only: hands mask and value to __reduce_and_sync_unsigned_impl.
+__SM_80_RT_DECL__ unsigned int __reduce_and_sync(unsigned int mask, unsigned int value)
+{ return __reduce_and_sync_unsigned_impl(mask, value); }
+
+// unsigned-only: hands mask and value to __reduce_or_sync_unsigned_impl.
+__SM_80_RT_DECL__ unsigned int __reduce_or_sync(unsigned int mask, unsigned int value)
+{ return __reduce_or_sync_unsigned_impl(mask, value); }
+
+// unsigned-only: hands mask and value to __reduce_xor_sync_unsigned_impl.
+__SM_80_RT_DECL__ unsigned int __reduce_xor_sync(unsigned int mask, unsigned int value)
+{ return __reduce_xor_sync_unsigned_impl(mask, value); }
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 800 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_80_RT_DECL__
+
+#endif /* !__SM_80_RT_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e250634fe76651c2a15b5b492378efec1d3e0c5
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.h
@@ -0,0 +1,282 @@
+/*
+ * Copyright 2022-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_90_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_90_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_H__
+#endif
+
+#if !defined(__SM_90_RT_H__)
+#define __SM_90_RT_H__
+
+/* Qualifiers prefixed to every function declared in this header: under NVRTC
+ * (__CUDACC_RTC__ defined) they are "__host__ __device__"; otherwise they are
+ * "static __device__ __inline__".  The macro is #undef'd near the end of the
+ * header. */
+#if defined(__CUDACC_RTC__)
+#define __SM_90_RT_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_90_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+/* __DEF_IF_HOST terminates each declaration below.  When neither
+ * __CUDA_ARCH__ nor _NVHPC_CUDA is defined it expands to an empty body "{ }",
+ * turning the declaration into a do-nothing definition; otherwise it expands
+ * to ";" so the line is a prototype only. */
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+// EXCLUDE_FROM_RTC is defined only across this span and is not tested anywhere
+// in this header; presumably it is a marker for NVRTC header processing
+// (TODO confirm).
+#define EXCLUDE_FROM_RTC
+
+// Cluster-related entry points.  Each line is completed by __DEF_IF_HOST, so
+// it is either a prototype (";") or an empty-bodied definition ("{ }")
+// depending on how that macro was defined above.
+__SM_90_RT_DECL__ unsigned __isCtaShared(const void *ptr) __DEF_IF_HOST
+__SM_90_RT_DECL__ unsigned __isClusterShared(const void *ptr) __DEF_IF_HOST
+__SM_90_RT_DECL__ void *__cluster_map_shared_rank(const void *ptr, unsigned target_block_rank)  __DEF_IF_HOST
+__SM_90_RT_DECL__ unsigned __cluster_query_shared_rank(const void *ptr) __DEF_IF_HOST
+__SM_90_RT_DECL__ uint2 __cluster_map_shared_multicast(const void *ptr, unsigned cluster_cta_mask) __DEF_IF_HOST
+__SM_90_RT_DECL__ unsigned __clusterDimIsSpecified() __DEF_IF_HOST
+__SM_90_RT_DECL__ dim3 __clusterDim() __DEF_IF_HOST
+__SM_90_RT_DECL__ dim3 __clusterRelativeBlockIdx() __DEF_IF_HOST
+__SM_90_RT_DECL__ dim3 __clusterGridDimInClusters() __DEF_IF_HOST
+__SM_90_RT_DECL__ dim3 __clusterIdx() __DEF_IF_HOST
+__SM_90_RT_DECL__ unsigned __clusterRelativeBlockRank() __DEF_IF_HOST
+__SM_90_RT_DECL__ unsigned __clusterSizeInBlocks() __DEF_IF_HOST
+__SM_90_RT_DECL__ void __cluster_barrier_arrive() __DEF_IF_HOST
+__SM_90_RT_DECL__ void __cluster_barrier_arrive_relaxed() __DEF_IF_HOST
+__SM_90_RT_DECL__ void __cluster_barrier_wait() __DEF_IF_HOST
+__SM_90_RT_DECL__ void __threadfence_cluster() __DEF_IF_HOST
+
+// atomicAdd overloads for float2 and float4 operands, each with _block and
+// _system suffixed variants.
+__SM_90_RT_DECL__ float2 atomicAdd(float2 *__address, float2 val) __DEF_IF_HOST
+__SM_90_RT_DECL__ float2 atomicAdd_block(float2 *__address, float2 val) __DEF_IF_HOST
+__SM_90_RT_DECL__ float2 atomicAdd_system(float2 *__address, float2 val) __DEF_IF_HOST
+__SM_90_RT_DECL__ float4 atomicAdd(float4 *__address, float4 val) __DEF_IF_HOST
+__SM_90_RT_DECL__ float4 atomicAdd_block(float4 *__address, float4 val) __DEF_IF_HOST
+__SM_90_RT_DECL__ float4 atomicAdd_system(float4 *__address, float4 val) __DEF_IF_HOST
+
+#undef EXCLUDE_FROM_RTC
+
+//Note: the atomic functions below are templates, so they cannot be represented
+//in NVRTC's builtin representation and have to be parsed on every NVRTC compilation.
+//(notice that the 'EXCLUDE_FROM_RTC' region ends above)
+
+
+#ifndef __NV_DISABLE_128_ATOMICS
+// lgen definitions for 128b atomics
+// Device builtins with C linkage that operate purely on untyped pointers.  The
+// templated wrappers later in this header call the CAS forms with
+// (address, &compare, &value, &result) and the Exch forms with
+// (address, &value, &result); the _block and _system names back the
+// correspondingly suffixed wrappers.
+extern "C" {
+  __device__ __device_builtin__ void __u128AtomicCAS(void *, void *, void *, void *);
+  __device__ __device_builtin__ void __u128AtomicCAS_block(void *, void *, void *, void *);
+  __device__ __device_builtin__ void __u128AtomicCAS_system(void *, void *, void *, void *);
+  __device__ __device_builtin__ void __u128AtomicExch(void *, void *, void *);
+  __device__ __device_builtin__ void __u128AtomicExch_block(void *, void *, void *);
+  __device__ __device_builtin__ void __u128AtomicExch_system(void *, void *, void *);
+}
+
+// macro to get address of object, to workaround situations where the type overloads the "&" operator
+// The operand is first reinterpreted as a "const volatile char &", so the "&"
+// that is finally applied is the built-in one; const_cast then drops the
+// cv-qualifiers and the outer cast yields a plain "void *".
+#define __NV_ATOMIC_ADDRESSOF(__val) \
+        (void *)(&(const_cast<char &>(reinterpret_cast<const volatile char &>(__val))))
+
+// enable_if
+// Minimal enable_if: the primary template is empty, and only the <true, _T>
+// specialization exposes the nested typedef __type (an alias for _T).  Naming
+// ::__type with a false condition therefore fails to substitute.
+template<bool __b, typename _T>
+struct __nv_atomic_enable_if { };
+
+template<typename _T>
+struct __nv_atomic_enable_if<true, _T> { typedef _T __type; };
+
+// alignof
+// Selects the alignment-query keyword: __alignof__ under NVRTC
+// (__CUDACC_RTC__ defined), __alignof otherwise.
+#if defined(__CUDACC_RTC__)
+#define __NV_ATOMIC_ALIGNOF __alignof__
+#else
+#define __NV_ATOMIC_ALIGNOF __alignof
+#endif
+
+// trivially copyable
+// __val is a compile-time bool saying whether _T may be treated as trivially
+// copyable.  The intrinsic used depends on the host compiler:
+//   * GCC older than 4.3   : no check is made, __val is always true
+//   * GCC 4.3 up to 4.x    : __has_trivial_copy(_T)
+//   * GCC 5+ and non-GCC   : __is_trivially_copyable(_T)
+template <typename _T>
+struct __nv_atomic_triv_cp_helper {
+#if defined(__GNUC__)
+#if  (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)
+  static const bool __val = true;
+#elif (__GNUC__ < 5)
+  static const bool __val = __has_trivial_copy(_T);
+#else
+  static const bool __val = __is_trivially_copyable(_T);
+#endif
+#else
+  static const bool __val = __is_trivially_copyable(_T);
+#endif
+};
+// Shorthand for reading the helper's __val for a given type.
+#define __NV_ATOMIC_TRIVIALLY_COPYABLE(_T) \
+        __nv_atomic_triv_cp_helper<_T>::__val
+
+// return type
+// From C++20 the return type is simply _T (the constraint is expressed by
+// __NV_ATOMIC_REQUIRES instead).  Before C++20 it is the ::__type member of
+// __nv_atomic_enable_if, which only exists when _T is exactly 16 bytes, is
+// aligned to at least 16 bytes and is trivially copyable.
+#if __cplusplus >= 202002L // C++20 or greater
+#define __NV_ATOMIC_RET_TYPE(_T) _T
+#else
+#define __NV_ATOMIC_RET_TYPE(_T) typename \
+  __nv_atomic_enable_if<sizeof(_T) == 16 && \
+  __NV_ATOMIC_ALIGNOF(_T) >= 16 && \
+  __NV_ATOMIC_TRIVIALLY_COPYABLE(_T), _T>::__type
+#endif
+
+// requires
+// From C++20 this expands to a trailing requires-clause demanding that _T is
+// exactly 16 bytes, aligned to at least 16 bytes and trivially copyable.
+// Before C++20 it expands to nothing; the same conditions are then enforced
+// through __NV_ATOMIC_RET_TYPE.
+#if __cplusplus >= 202002L // C++20 or greater
+#define __NV_ATOMIC_REQUIRES(_T) \
+  requires(sizeof(_T) == 16 && \
+  __NV_ATOMIC_ALIGNOF(_T) >= 16 && \
+  __NV_ATOMIC_TRIVIALLY_COPYABLE(_T))
+#else
+#define __NV_ATOMIC_REQUIRES(_T)
+#endif
+
+// temp value and return value
+// __NV_ATOMIC_TEMP declares the local storage that receives the builtin's
+// result and __NV_ATOMIC_RET names that storage.
+//   * C++11+ or MSVC: a local union _U holding a _T member __ret, with an
+//     empty user-provided constructor, instantiated as __u; the result is
+//     __u.__ret.  Presumably the union is used so that _T itself is never
+//     default-constructed (TODO confirm).
+//   * otherwise: a plain local "_T __ret".
+#if __cplusplus >= 201103L || defined(_MSC_VER) // C++11 or greater, or MSC
+#define __NV_ATOMIC_TEMP(_T) union _U \
+  {_T __ret; __device__ __inline__ _U() {}}; _U __u
+#define __NV_ATOMIC_RET(_T) __u.__ret
+#else
+#define __NV_ATOMIC_TEMP(_T) _T __ret
+#define __NV_ATOMIC_RET(_T) __ret
+#endif
+
+// templated 128-bit atomics
+//
+// atomicCAS for 16-byte types.  Passes __address together with the addresses
+// of the by-value __compare and __val copies to the __u128AtomicCAS builtin,
+// which writes its result into the temporary declared by __NV_ATOMIC_TEMP;
+// that temporary is returned by value.  _T is constrained by
+// __NV_ATOMIC_RET_TYPE / __NV_ATOMIC_REQUIRES (16 bytes, >= 16-byte aligned,
+// trivially copyable).
+// NOTE(review): the result is presumably the value previously stored at
+// *__address, as with the other atomicCAS overloads -- confirm in CUDA docs.
+template <typename _T>
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicCAS(_T *__address, _T __compare, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T);
+  __u128AtomicCAS((void *)(__address),
+                  __NV_ATOMIC_ADDRESSOF(__compare),
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
+  return __NV_ATOMIC_RET(_T);
+}
+
+// atomicCAS_block for 16-byte types.  Passes __address together with the
+// addresses of the by-value __compare and __val copies to the
+// __u128AtomicCAS_block builtin, which writes its result into the temporary
+// declared by __NV_ATOMIC_TEMP; that temporary is returned by value.  _T is
+// constrained by __NV_ATOMIC_RET_TYPE / __NV_ATOMIC_REQUIRES.
+template <typename _T>
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicCAS_block(_T *__address, _T __compare, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T);
+  __u128AtomicCAS_block((void *)(__address),
+                  __NV_ATOMIC_ADDRESSOF(__compare),
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
+  return __NV_ATOMIC_RET(_T);
+}
+
+// atomicCAS_system for 16-byte types.  Passes __address together with the
+// addresses of the by-value __compare and __val copies to the
+// __u128AtomicCAS_system builtin, which writes its result into the temporary
+// declared by __NV_ATOMIC_TEMP; that temporary is returned by value.  _T is
+// constrained by __NV_ATOMIC_RET_TYPE / __NV_ATOMIC_REQUIRES.
+template <typename _T>
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicCAS_system(_T *__address, _T __compare, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T);
+  __u128AtomicCAS_system((void *)(__address),
+                  __NV_ATOMIC_ADDRESSOF(__compare),
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
+  return __NV_ATOMIC_RET(_T);
+}
+
+// atomicExch for 16-byte types.  Passes __address and the address of the
+// by-value __val copy to the __u128AtomicExch builtin, which writes its result
+// into the temporary declared by __NV_ATOMIC_TEMP; that temporary is returned
+// by value.  _T is constrained by __NV_ATOMIC_RET_TYPE / __NV_ATOMIC_REQUIRES.
+// NOTE(review): the result is presumably the value previously stored at
+// *__address, as with the other atomicExch overloads -- confirm in CUDA docs.
+template <typename _T>
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicExch(_T *__address, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T);
+  __u128AtomicExch((void *)(__address),
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
+  return __NV_ATOMIC_RET(_T);
+}
+
+// atomicExch_block for 16-byte types.  Passes __address and the address of the
+// by-value __val copy to the __u128AtomicExch_block builtin, which writes its
+// result into the temporary declared by __NV_ATOMIC_TEMP; that temporary is
+// returned by value.  _T is constrained by __NV_ATOMIC_RET_TYPE /
+// __NV_ATOMIC_REQUIRES.
+template <typename _T>
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicExch_block(_T *__address, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T);
+  __u128AtomicExch_block((void *)(__address),
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
+  return __NV_ATOMIC_RET(_T);
+}
+
+// atomicExch_system for 16-byte types.  Passes __address and the address of
+// the by-value __val copy to the __u128AtomicExch_system builtin, which writes
+// its result into the temporary declared by __NV_ATOMIC_TEMP; that temporary
+// is returned by value.  _T is constrained by __NV_ATOMIC_RET_TYPE /
+// __NV_ATOMIC_REQUIRES.
+template <typename _T>
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicExch_system(_T *__address, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T);
+  __u128AtomicExch_system((void *)(__address),
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
+  return __NV_ATOMIC_RET(_T);
+}
+#endif /* !__NV_DISABLE_128_ATOMICS */
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 900 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_90_RT_DECL__
+
+#if (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA)
+#include "sm_90_rt.hpp"
+#endif /* (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA) */
+
+#endif /* !__SM_90_RT_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_H__
+#endif
+
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.hpp b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4e61ac78b996fa03cadf60208bbd58f2e781f3ec
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.hpp
@@ -0,0 +1,248 @@
+/*
+ * Copyright 2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_90_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_90_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_HPP__
+#endif
+
+#if !defined(__SM_90_RT_HPP__)
+#define __SM_90_RT_HPP__
+
+/* Qualifiers prefixed to every function defined in this file: under NVRTC
+ * (__CUDACC_RTC__ defined) they are "__host__ __device__"; otherwise they are
+ * "static __device__ __inline__".  The macro is #undef'd near the end of the
+ * file. */
+#if defined(__CUDACC_RTC__)
+#define __SM_90_RT_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_90_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-9.0 builtin functions which are included as *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+/* C-linkage symbols that the inline wrappers in this file forward to.
+ *  - The clusterDim / clusterRelativeBlockIdx / clusterGridDimInClusters /
+ *    clusterIdx helpers return their three components through unsigned
+ *    out-pointers; the wrappers pass &x, &y, &z and build a dim3 from them.
+ *  - __nv_clusterDimIsSpecifed_impl is spelled without the second 'i' of
+ *    "Specified"; __clusterDimIsSpecified() calls it by exactly this name, so
+ *    the spelling must not be "corrected" here alone.
+ *  - The __f2AtomicAdd* / __f4AtomicAdd* builtins back the float2 / float4
+ *    atomicAdd overloads. */
+extern "C" {
+  __device__ unsigned  __nv_isClusterShared_impl(const void *);
+  __device__ void * __nv_cluster_map_shared_rank_impl(const void *, unsigned);
+  __device__ unsigned __nv_cluster_query_shared_rank_impl(const void *);
+  __device__ unsigned __nv_clusterDimIsSpecifed_impl();
+  __device__ void __nv_clusterDim_impl(unsigned *, unsigned *, unsigned *);
+  __device__ void __nv_clusterRelativeBlockIdx_impl(unsigned *, 
+                                                    unsigned *, unsigned *);
+  __device__ void __nv_clusterGridDimInClusters_impl(unsigned *, 
+                                                     unsigned *, unsigned *);
+  __device__ void __nv_clusterIdx_impl(unsigned *, unsigned *, unsigned *);
+  __device__ unsigned __nv_clusterRelativeBlockRank_impl();
+  __device__ unsigned __nv_clusterSizeInBlocks_impl();
+  __device__ void __nv_cluster_barrier_arrive_impl();
+  __device__ void __nv_cluster_barrier_arrive_relaxed_impl();
+  __device__ void __nv_cluster_barrier_wait_impl();
+  __device__ void __nv_threadfence_cluster_impl();
+
+  __device__ __device_builtin__ float2 __f2AtomicAdd(float2 *, float2);
+  __device__ __device_builtin__ float2 __f2AtomicAdd_block(float2 *, float2);
+  __device__ __device_builtin__ float2 __f2AtomicAdd_system(float2 *, float2);
+  __device__ __device_builtin__ float4 __f4AtomicAdd(float4 *, float4);
+  __device__ __device_builtin__ float4 __f4AtomicAdd_block(float4 *, float4);
+  __device__ __device_builtin__ float4 __f4AtomicAdd_system(float4 *, float4);
+} // extern "C"
+
+/* Calls __isShared(ptr) and returns its result unchanged. */
+__SM_90_RT_DECL__ unsigned __isCtaShared(const void *ptr) {
+  const unsigned in_cta_shared = __isShared(ptr);
+  return in_cta_shared;
+}
+
+/* Calls __nv_isClusterShared_impl(ptr) and returns its result unchanged. */
+__SM_90_RT_DECL__ unsigned __isClusterShared(const void *ptr) {
+  const unsigned in_cluster_shared = __nv_isClusterShared_impl(ptr);
+  return in_cluster_shared;
+}
+
+/* Hands (ptr, target_block_rank) to __nv_cluster_map_shared_rank_impl and
+ * returns the pointer it produces. */
+__SM_90_RT_DECL__ void *__cluster_map_shared_rank(const void *ptr, unsigned target_block_rank) {
+  void *const mapped = __nv_cluster_map_shared_rank_impl(ptr, target_block_rank);
+  return mapped;
+}
+
+/* Calls __nv_cluster_query_shared_rank_impl(ptr) and returns its result. */
+__SM_90_RT_DECL__ unsigned __cluster_query_shared_rank(const void *ptr) {
+  const unsigned owner_rank = __nv_cluster_query_shared_rank_impl(ptr);
+  return owner_rank;
+}
+
+/* Builds a uint2 whose first component is __cvta_generic_to_shared(ptr)
+ * converted to unsigned and whose second component is cluster_cta_mask. */
+__SM_90_RT_DECL__ uint2 __cluster_map_shared_multicast(const void *ptr, unsigned int cluster_cta_mask) {
+  const unsigned shared_addr = (unsigned)__cvta_generic_to_shared(ptr);
+  return make_uint2(shared_addr, cluster_cta_mask);
+}
+
+/* Returns the result of __nv_clusterDimIsSpecifed_impl() (the builtin's name
+ * really is spelled "Specifed"). */
+__SM_90_RT_DECL__ unsigned __clusterDimIsSpecified() {
+  const unsigned is_specified = __nv_clusterDimIsSpecifed_impl();
+  return is_specified;
+}
+
+/* Fetches three components from __nv_clusterDim_impl through out-pointers
+ * and returns them packed as a dim3. */
+__SM_90_RT_DECL__ dim3 __clusterDim() {
+  unsigned dim_x, dim_y, dim_z;
+  __nv_clusterDim_impl(&dim_x, &dim_y, &dim_z);
+  dim3 result(dim_x, dim_y, dim_z);
+  return result;
+}
+
+/* Fetches three components from __nv_clusterRelativeBlockIdx_impl through
+ * out-pointers and returns them packed as a dim3. */
+__SM_90_RT_DECL__ dim3 __clusterRelativeBlockIdx() {
+  unsigned idx_x, idx_y, idx_z;
+  __nv_clusterRelativeBlockIdx_impl(&idx_x, &idx_y, &idx_z);
+  dim3 result(idx_x, idx_y, idx_z);
+  return result;
+}
+
+/* Fetches three components from __nv_clusterGridDimInClusters_impl through
+ * out-pointers and returns them packed as a dim3. */
+__SM_90_RT_DECL__ dim3 __clusterGridDimInClusters() {
+  unsigned dim_x, dim_y, dim_z;
+  __nv_clusterGridDimInClusters_impl(&dim_x, &dim_y, &dim_z);
+  dim3 result(dim_x, dim_y, dim_z);
+  return result;
+}
+
+/* Fetches three components from __nv_clusterIdx_impl through out-pointers
+ * and returns them packed as a dim3. */
+__SM_90_RT_DECL__ dim3 __clusterIdx() {
+  unsigned idx_x, idx_y, idx_z;
+  __nv_clusterIdx_impl(&idx_x, &idx_y, &idx_z);
+  dim3 result(idx_x, idx_y, idx_z);
+  return result;
+}
+
+/* Returns the result of __nv_clusterRelativeBlockRank_impl(). */
+__SM_90_RT_DECL__ unsigned __clusterRelativeBlockRank() {
+  const unsigned block_rank = __nv_clusterRelativeBlockRank_impl();
+  return block_rank;
+}
+
+/* Returns the result of __nv_clusterSizeInBlocks_impl(). */
+__SM_90_RT_DECL__ unsigned __clusterSizeInBlocks() {
+  const unsigned block_count = __nv_clusterSizeInBlocks_impl();
+  return block_count;
+}
+
+/* Invokes __nv_cluster_barrier_arrive_impl(); nothing is returned. */
+__SM_90_RT_DECL__ void __cluster_barrier_arrive() {
+  __nv_cluster_barrier_arrive_impl();
+  return;
+}
+
+/* Invokes __nv_cluster_barrier_arrive_relaxed_impl(); nothing is returned. */
+__SM_90_RT_DECL__ void __cluster_barrier_arrive_relaxed() {
+  __nv_cluster_barrier_arrive_relaxed_impl();
+  return;
+}
+
+/* Invokes __nv_cluster_barrier_wait_impl(); nothing is returned. */
+__SM_90_RT_DECL__ void __cluster_barrier_wait() {
+  __nv_cluster_barrier_wait_impl();
+  return;
+}
+
+/* Invokes __nv_threadfence_cluster_impl(); nothing is returned. */
+__SM_90_RT_DECL__ void __threadfence_cluster() {
+  __nv_threadfence_cluster_impl();
+  return;
+}
+
+
+/* Define __PTR for atomicAdd prototypes below, undef after done.
+ * It is "l" on 64-bit MSVC, on LP64 targets and under NVRTC, and "r"
+ * otherwise.  None of the definitions below reference it; it is #undef'd
+ * after the last overload so it does not leak to includers. */
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __PTR   "l"
+#else
+#define __PTR   "r"
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+
+/* float2 atomicAdd: forwards (address, val) to the __f2AtomicAdd builtin and
+ * returns its result. */
+__SM_90_RT_DECL__ float2 atomicAdd(float2 *address, float2 val) {
+  return __f2AtomicAdd(address, val);
+}
+
+/* float2 atomicAdd_block: forwards to the __f2AtomicAdd_block builtin. */
+__SM_90_RT_DECL__ float2 atomicAdd_block(float2 *address, float2 val) {
+  return __f2AtomicAdd_block(address, val);
+}
+
+/* float2 atomicAdd_system: forwards to the __f2AtomicAdd_system builtin. */
+__SM_90_RT_DECL__ float2 atomicAdd_system(float2 *address, float2 val) {
+  return __f2AtomicAdd_system(address, val);
+}
+
+/* float4 atomicAdd: forwards (address, val) to the __f4AtomicAdd builtin and
+ * returns its result. */
+__SM_90_RT_DECL__ float4 atomicAdd(float4 *address, float4 val) {
+  return __f4AtomicAdd(address, val);
+}
+
+/* float4 atomicAdd_block: forwards to the __f4AtomicAdd_block builtin. */
+__SM_90_RT_DECL__ float4 atomicAdd_block(float4 *address, float4 val) {
+  return __f4AtomicAdd_block(address, val);
+}
+
+/* float4 atomicAdd_system: forwards to the __f4AtomicAdd_system builtin. */
+__SM_90_RT_DECL__ float4 atomicAdd_system(float4 *address, float4 val) {
+  return __f4AtomicAdd_system(address, val);
+}
+
+/* Done with the atomicAdd definitions: drop __PTR as announced above. */
+#undef __PTR
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 900 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_90_RT_DECL__
+
+#endif /* !__SM_90_RT_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_HPP__
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/storage_class.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/storage_class.h
new file mode 100644
index 0000000000000000000000000000000000000000..1fb19bd46ebde4a53dfad866050fad9fb0cbd222
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/storage_class.h
@@ -0,0 +1,142 @@
+/*
+ * NVIDIA_COPYRIGHT_BEGIN
+ *
+ * Copyright (c) 2008-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ * NVIDIA_COPYRIGHT_END
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/storage_class.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/storage_class.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__
+#endif
+
+#if !defined(__STORAGE_CLASS_H__)
+#define __STORAGE_CLASS_H__
+
+/* Default __var_used__ to an empty expansion unless the includer has already
+ * defined it.  It is appended to most of the __storage_* expansions below. */
+#if !defined(__var_used__)
+
+#define __var_used__
+
+#endif /* __var_used__ */
+
+/* __loc_sc__(loc, size, sc) pastes "__storage", "_", sc, size and loc into a
+ * single identifier and follows it with loc.  The pasted identifier is one of
+ * the __storage_* macros defined below; for example sc = extern, an empty
+ * size and loc = __device__ select __storage_extern__device__, and a size of
+ * _unsized selects the *_unsized__shared__ variants. */
+#if !defined(__loc_sc__)
+
+#define __loc_sc__(loc, size, sc) \
+        __storage##_##sc##size##loc loc
+
+#endif /* !__loc_sc__ */
+
+/* Storage-class mappings for __device__ variables, one macro per original
+ * storage class (none / extern / auto / static).  Each can be pre-defined by
+ * the includer.  All valid combinations expand to "static __var_used__"; the
+ * auto variant expands to tokens that are not valid C/C++, so using it fails
+ * to compile (presumably on purpose, to reject that combination). */
+#if !defined(__storage___device__)
+#define __storage___device__ static __var_used__
+#endif /* __storage___device__ */
+
+#if !defined(__storage_extern__device__)
+#define __storage_extern__device__ static __var_used__
+#endif /* __storage_extern__device__ */
+
+#if !defined(__storage_auto__device__)
+#define __storage_auto__device__ @@@ COMPILER @@@ ERROR @@@
+#endif /* __storage_auto__device__ */
+
+#if !defined(__storage_static__device__)
+#define __storage_static__device__ static __var_used__
+#endif /* __storage_static__device__ */
+
+/* Storage-class mappings for __constant__ variables (none / extern / auto /
+ * static).  Each can be pre-defined by the includer.  All valid combinations
+ * expand to "static __var_used__"; the auto variant expands to tokens that
+ * are not valid C/C++, so using it fails to compile (presumably on purpose). */
+#if !defined(__storage___constant__)
+#define __storage___constant__ static __var_used__
+#endif /* __storage___constant__ */
+
+#if !defined(__storage_extern__constant__)
+#define __storage_extern__constant__ static __var_used__
+#endif /* __storage_extern__constant__ */
+
+#if !defined(__storage_auto__constant__)
+#define __storage_auto__constant__ @@@ COMPILER @@@ ERROR @@@
+#endif /* __storage_auto__constant__ */
+
+#if !defined(__storage_static__constant__)
+#define __storage_static__constant__ static __var_used__
+#endif /* __storage_static__constant__ */
+
+/* Storage-class mappings for __shared__ variables.  Each can be pre-defined
+ * by the includer.
+ *  - Sized variants: none / extern / static expand to "static __var_used__";
+ *    auto is accepted here (unlike the other memory spaces) and expands to a
+ *    plain "static" without __var_used__.
+ *  - Unsized variants (size argument _unsized): only extern is valid and
+ *    expands to "static __var_used__"; none / auto / static expand to tokens
+ *    that are not valid C/C++, so using them fails to compile (presumably on
+ *    purpose). */
+#if !defined(__storage___shared__)
+#define __storage___shared__ static __var_used__
+#endif /* __storage___shared__ */
+
+#if !defined(__storage_extern__shared__)
+#define __storage_extern__shared__ static __var_used__
+#endif /* __storage_extern__shared__ */
+
+#if !defined(__storage_auto__shared__)
+#define __storage_auto__shared__ static
+#endif /* __storage_auto__shared__ */
+
+#if !defined(__storage_static__shared__)
+#define __storage_static__shared__ static __var_used__
+#endif /* __storage_static__shared__ */
+
+#if !defined(__storage__unsized__shared__)
+#define __storage__unsized__shared__ @@@ COMPILER @@@ ERROR @@@
+#endif /* __storage__unsized__shared__ */
+
+#if !defined(__storage_extern_unsized__shared__)
+#define __storage_extern_unsized__shared__ static __var_used__
+#endif /* __storage_extern_unsized__shared__ */
+
+#if !defined(__storage_auto_unsized__shared__)
+#define __storage_auto_unsized__shared__ @@@ COMPILER @@@ ERROR @@@
+#endif /* __storage_auto_unsized__shared__ */
+
+#if !defined(__storage_static_unsized__shared__)
+#define __storage_static_unsized__shared__ @@@ COMPILER @@@ ERROR @@@
+#endif /* __storage_static_unsized__shared__ */
+
+/* Storage-class mappings for __text__ variables (none / extern / auto /
+ * static).  Each can be pre-defined by the includer.  All valid combinations
+ * expand to "static __var_used__"; the auto variant expands to tokens that
+ * are not valid C/C++, so using it fails to compile (presumably on purpose). */
+#if !defined(__storage___text__)
+#define __storage___text__ static __var_used__
+#endif /* __storage___text__ */
+
+#if !defined(__storage_extern__text__)
+#define __storage_extern__text__ static __var_used__
+#endif /* __storage_extern__text__ */
+
+#if !defined(__storage_auto__text__)
+#define __storage_auto__text__ @@@ COMPILER @@@ ERROR @@@
+#endif /* __storage_auto__text__ */
+
+#if !defined(__storage_static__text__)
+#define __storage_static__text__ static __var_used__
+#endif /* __storage_static__text__ */
+
+/* Storage-class mappings for __surf__ variables (none / extern / auto /
+ * static).  Each can be pre-defined by the includer.  All valid combinations
+ * expand to "static __var_used__"; the auto variant expands to tokens that
+ * are not valid C/C++, so using it fails to compile (presumably on purpose). */
+#if !defined(__storage___surf__)
+#define __storage___surf__ static __var_used__
+#endif /* __storage___surf__ */
+
+#if !defined(__storage_extern__surf__)
+#define __storage_extern__surf__ static __var_used__
+#endif /* __storage_extern__surf__ */
+
+#if !defined(__storage_auto__surf__)
+#define __storage_auto__surf__ @@@ COMPILER @@@ ERROR @@@
+#endif /* __storage_auto__surf__ */
+
+#if !defined(__storage_static__surf__)
+#define __storage_static__surf__ static __var_used__
+#endif /* __storage_static__surf__ */
+
+#endif /* !__STORAGE_CLASS_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaGL.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaGL.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a9c70e881774c8f3cf8b6430e7aa53a98d74669
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaGL.h
@@ -0,0 +1,608 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAGL_H
+#define CUDAGL_H
+
+#include <cuda.h>
+#include <GL/gl.h>
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+
+#ifdef CUDA_FORCE_API_VERSION
+#error "CUDA_FORCE_API_VERSION is no longer supported."
+#endif
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
+    #define __CUDA_API_PER_THREAD_DEFAULT_STREAM
+    #define __CUDA_API_PTDS(api) api ## _ptds
+    #define __CUDA_API_PTSZ(api) api ## _ptsz
+#else
+    #define __CUDA_API_PTDS(api) api
+    #define __CUDA_API_PTSZ(api) api
+#endif
+
+#define cuGLCtxCreate            cuGLCtxCreate_v2
+#define cuGLMapBufferObject      __CUDA_API_PTDS(cuGLMapBufferObject_v2)
+#define cuGLMapBufferObjectAsync __CUDA_API_PTSZ(cuGLMapBufferObjectAsync_v2)
+#define cuGLGetDevices           cuGLGetDevices_v2
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \file cudaGL.h
+ * \brief Header file for the OpenGL interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ */
+
+/**
+ * \defgroup CUDA_GL OpenGL Interoperability
+ * \ingroup CUDA_DRIVER
+ *
+ * ___MANBRIEF___ OpenGL interoperability functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the OpenGL interoperability functions of the
+ * low-level CUDA driver application programming interface. Note that mapping 
+ * of OpenGL resources is performed with the graphics API agnostic, resource 
+ * mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
+ *
+ * @{
+ */
+
+#if defined(_WIN32)
+#if !defined(WGL_NV_gpu_affinity)
+typedef void* HGPUNV;
+#endif
+#endif /* _WIN32 */
+
+/**
+ * \brief Registers an OpenGL buffer object
+ *
+ * Registers the buffer object specified by \p buffer for access by
+ * CUDA.  A handle to the registered object is returned as \p
+ * pCudaResource.  The register flags \p Flags specify the intended usage,
+ * as follows:
+ *
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * \param pCudaResource - Pointer to the returned object handle
+ * \param buffer - name of buffer object to be registered
+ * \param Flags - Register flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_OPERATING_SYSTEM
+ * \notefnerr
+ *
+ * \sa 
+ * ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsGLRegisterBuffer
+ */
+CUresult CUDAAPI cuGraphicsGLRegisterBuffer(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
+
+/**
+ * \brief Register an OpenGL texture or renderbuffer object
+ *
+ * Registers the texture or renderbuffer object specified by \p image for access by CUDA.  
+ * A handle to the registered object is returned as \p pCudaResource.  
+ *
+ * \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D, 
+ * ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY, 
+ * or ::GL_RENDERBUFFER.
+ *
+ * The register flags \p Flags specify the intended usage, as follows:
+ *
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
+ *   bind this resource to a surface reference.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
+ *   texture gather operations on this resource.
+ *
+ * The following image formats are supported. For brevity's sake, the list is abbreviated.
+ * For ex., {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats 
+ * {GL_R8, GL_R16, GL_RG8, GL_RG16} :
+ * - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
+ * - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
+ * - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
+ * {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
+ *
+ * The following image classes are currently disallowed:
+ * - Textures with borders
+ * - Multisampled renderbuffers
+ *
+ * \param pCudaResource - Pointer to the returned object handle
+ * \param image - name of texture or renderbuffer object to be registered
+ * \param target - Identifies the type of object specified by \p image
+ * \param Flags - Register flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_OPERATING_SYSTEM
+ * \notefnerr
+ *
+ * \sa 
+ * ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsSubResourceGetMappedArray,
+ * ::cudaGraphicsGLRegisterImage
+ */
+CUresult CUDAAPI cuGraphicsGLRegisterImage(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
+
+#ifdef _WIN32
+/**
+ * \brief Gets the CUDA device associated with hGpu
+ *
+ * Returns in \p *pDevice the CUDA device associated with a \p hGpu, if
+ * applicable.
+ *
+ * \param pDevice - Device associated with hGpu
+ * \param hGpu    - Handle to a GPU, as queried via the WGL_NV_gpu_affinity extension
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuGLMapBufferObject,
+ * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
+ * ::cuGLUnregisterBufferObject, ::cuGLUnmapBufferObjectAsync,
+ * ::cuGLSetBufferObjectMapFlags,
+ * ::cudaWGLGetDevice
+ */
+CUresult CUDAAPI cuWGLGetDevice(CUdevice *pDevice, HGPUNV hGpu);
+#endif /* _WIN32 */
+
+/**
+ * CUDA devices corresponding to an OpenGL device
+ */
+typedef enum CUGLDeviceList_enum {
+    CU_GL_DEVICE_LIST_ALL            = 0x01, /**< The CUDA devices for all GPUs used by the current OpenGL context */
+    CU_GL_DEVICE_LIST_CURRENT_FRAME  = 0x02, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
+    CU_GL_DEVICE_LIST_NEXT_FRAME     = 0x03, /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
+} CUGLDeviceList;
+
+/**
+ * \brief Gets the CUDA devices associated with the current OpenGL context
+ *
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices 
+ * corresponding to the current OpenGL context. Also returns in \p *pCudaDevices 
+ * at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to
+ * the current OpenGL context. If any of the GPUs being used by the current OpenGL
+ * context are not CUDA capable then the call will return ::CUDA_ERROR_NO_DEVICE.
+ *
+ * The \p deviceList argument may be any of the following:
+ * - ::CU_GL_DEVICE_LIST_ALL: Query all devices used by the current OpenGL context.
+ * - ::CU_GL_DEVICE_LIST_CURRENT_FRAME: Query the devices used by the current OpenGL context to
+ *   render the current frame (in SLI).
+ * - ::CU_GL_DEVICE_LIST_NEXT_FRAME: Query the devices used by the current OpenGL context to
+ *   render the next frame (in SLI). Note that this is a prediction, it can't be guaranteed that
+ *   this is correct in all cases.
+ *
+ * \param pCudaDeviceCount - Returned number of CUDA devices.
+ * \param pCudaDevices     - Returned CUDA devices.
+ * \param cudaDeviceCount  - The size of the output device array pCudaDevices.
+ * \param deviceList       - The set of devices to return.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NO_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_GRAPHICS_CONTEXT,
+ * ::CUDA_ERROR_OPERATING_SYSTEM
+ *
+ * \notefnerr
+ *
+ * \sa
+ * ::cuWGLGetDevice,
+ * ::cudaGLGetDevices
+ */
+CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
+
+/**
+ * \defgroup CUDA_GL_DEPRECATED OpenGL Interoperability [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated OpenGL interoperability functions of the low-level
+ * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes deprecated OpenGL interoperability functionality.
+ *
+ * @{
+ */
+
+/** Flags to map or unmap a resource */
+typedef enum CUGLmap_flags_enum {
+    CU_GL_MAP_RESOURCE_FLAGS_NONE          = 0x00, /**< No hints about how the resource will be used; assumed read from and written to by CUDA kernels (default) */
+    CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY     = 0x01, /**< CUDA kernels which access the resource will not write to it */
+    CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02, /**< CUDA kernels which access the resource will not read from it and will overwrite its entire contents */
+} CUGLmap_flags;
+
+/**
+ * \brief Create a CUDA context for interoperability with OpenGL
+ *
+ * \deprecated This function is deprecated as of CUDA 5.0.
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA context with an OpenGL
+ * context in order to achieve maximum interoperability performance.
+ *
+ * \param pCtx   - Returned CUDA context
+ * \param Flags  - Options for CUDA context creation
+ * \param device - Device on which to create the context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate, ::cuGLInit, ::cuGLMapBufferObject,
+ * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
+ * ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
+ * ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
+ * ::cuWGLGetDevice
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
+
+/**
+ * \brief Initializes OpenGL interoperability
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Initializes OpenGL interoperability. This function is deprecated
+ * and calling it is no longer required. It may fail if the needed
+ * OpenGL driver facilities are not available.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuGLMapBufferObject,
+ * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
+ * ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
+ * ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
+ * ::cuWGLGetDevice
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLInit(void);
+
+/**
+ * \brief Registers an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Registers the buffer object specified by \p buffer for access by
+ * CUDA. This function must be called before CUDA can map the buffer
+ * object.  There must be a valid OpenGL context bound to the current
+ * thread when this function is called, and the buffer name is
+ * resolved by that context.
+ *
+ * \param buffer - The name of the buffer object to register.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_ALREADY_MAPPED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsGLRegisterBuffer
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLRegisterBufferObject(GLuint buffer);
+
+/**
+ * \brief Maps an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Maps the buffer object specified by \p buffer into the address space of the
+ * current CUDA context and returns in \p *dptr and \p *size the base pointer
+ * and size of the resulting mapping.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * All streams in the current CUDA context are synchronized with the
+ * current GL context.
+ *
+ * \param dptr   - Returned mapped base pointer
+ * \param size   - Returned size of mapping
+ * \param buffer - The name of the buffer object to map
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_MAP_FAILED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsMapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr *dptr, size_t *size,  GLuint buffer);  
+
+/**
+ * \brief Unmaps an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Unmaps the buffer object specified by \p buffer for access by CUDA.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * All streams in the current CUDA context are synchronized with the
+ * current GL context.
+ *
+ * \param buffer - Buffer object to unmap
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsUnmapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObject(GLuint buffer);
+
+/**
+ * \brief Unregister an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Unregisters the buffer object specified by \p buffer.  This
+ * releases any resources associated with the registered buffer.
+ * After this call, the buffer may no longer be mapped for access by
+ * CUDA.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * \param buffer - Name of the buffer object to unregister
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsUnregisterResource
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnregisterBufferObject(GLuint buffer);
+
+/**
+ * \brief Set the map flags for an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Sets the map flags for the buffer object specified by \p buffer.
+ *
+ * Changes to \p Flags will take effect the next time \p buffer is mapped.
+ * The \p Flags argument may be any of the following:
+ * - ::CU_GL_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA kernels. This is the default value.
+ * - ::CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA kernels which
+ *   access this resource will not write to this resource.
+ * - ::CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that CUDA kernels
+ *   which access this resource will not read from this resource and will
+ *   write over the entire contents of the resource, so none of the data
+ *   previously stored in the resource will be preserved.
+ *
+ * If \p buffer has not been registered for use with CUDA, then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p buffer is presently
+ * mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is returned.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * \param buffer - Buffer object whose map flags are to be set
+ * \param Flags  - Map flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsResourceSetMapFlags
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLSetBufferObjectMapFlags(GLuint buffer, unsigned int Flags);
+
+/**
+ * \brief Maps an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Maps the buffer object specified by \p buffer into the address space of the
+ * current CUDA context and returns in \p *dptr and \p *size the base pointer
+ * and size of the resulting mapping.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * Stream \p hStream in the current CUDA context is synchronized with
+ * the current GL context.
+ *
+ * \param dptr    - Returned mapped base pointer
+ * \param size    - Returned size of mapping
+ * \param buffer  - The name of the buffer object to map
+ * \param hStream - Stream to synchronize
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_MAP_FAILED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsMapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr *dptr, size_t *size,  GLuint buffer, CUstream hStream);
+
+/**
+ * \brief Unmaps an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Unmaps the buffer object specified by \p buffer for access by CUDA.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * Stream \p hStream in the current CUDA context is synchronized with
+ * the current GL context.
+ *
+ * \param buffer  - Name of the buffer object to unmap
+ * \param hStream - Stream to synchronize
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsUnmapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObjectAsync(GLuint buffer, CUstream hStream);
+
+/** @} */ /* END CUDA_GL_DEPRECATED */
+/** @} */ /* END CUDA_GL */
+
+
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #undef cuGLCtxCreate
+    #undef cuGLMapBufferObject
+    #undef cuGLMapBufferObjectAsync
+    #undef cuGLGetDevices
+
+    CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
+    CUresult CUDAAPI cuGLMapBufferObject_v2(CUdeviceptr *dptr, size_t *size,  GLuint buffer);
+    CUresult CUDAAPI cuGLMapBufferObjectAsync_v2(CUdeviceptr *dptr, size_t *size,  GLuint buffer, CUstream hStream);
+    CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
+    CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr_v1 *dptr, unsigned int *size,  GLuint buffer);
+    CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr_v1 *dptr, unsigned int *size,  GLuint buffer, CUstream hStream);
+#endif /* __CUDA_API_VERSION_INTERNAL */
+
+#ifdef __cplusplus
+} /* extern "C" -- no trailing ';': it would be a stray empty declaration */
+#endif
+
+#undef __CUDA_DEPRECATED
+
+#endif
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaGLTypedefs.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaGLTypedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..81f0d5349e435159647af9af379d1e8e8441221c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaGLTypedefs.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2020-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAGLTYPEDEFS_H
+#define CUDAGLTYPEDEFS_H
+
+// Dependent includes for cudaGL.h
+#include <GL/gl.h>
+
+#include <cudaGL.h>
+
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
+    #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
+    #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
+#else
+    #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
+    #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros for the latest version for each driver function in cudaGL.h
+ */
+#define PFN_cuGraphicsGLRegisterBuffer  PFN_cuGraphicsGLRegisterBuffer_v3000
+#define PFN_cuGraphicsGLRegisterImage  PFN_cuGraphicsGLRegisterImage_v3000
+#define PFN_cuWGLGetDevice  PFN_cuWGLGetDevice_v2020
+#define PFN_cuGLGetDevices  PFN_cuGLGetDevices_v6050
+#define PFN_cuGLCtxCreate  PFN_cuGLCtxCreate_v3020
+#define PFN_cuGLInit  PFN_cuGLInit_v2000
+#define PFN_cuGLRegisterBufferObject  PFN_cuGLRegisterBufferObject_v2000
+#define PFN_cuGLMapBufferObject  __API_TYPEDEF_PTDS(PFN_cuGLMapBufferObject, 3020, 7000)
+#define PFN_cuGLUnmapBufferObject  PFN_cuGLUnmapBufferObject_v2000
+#define PFN_cuGLUnregisterBufferObject  PFN_cuGLUnregisterBufferObject_v2000
+#define PFN_cuGLSetBufferObjectMapFlags  PFN_cuGLSetBufferObjectMapFlags_v2030
+#define PFN_cuGLMapBufferObjectAsync  __API_TYPEDEF_PTSZ(PFN_cuGLMapBufferObjectAsync, 3020, 7000)
+#define PFN_cuGLUnmapBufferObjectAsync  PFN_cuGLUnmapBufferObjectAsync_v2030
+
+
+/**
+ * Type definitions for functions defined in cudaGL.h
+ */
+typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterBuffer_v3000)(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterImage_v3000)(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
+#ifdef _WIN32
+typedef CUresult (CUDAAPI *PFN_cuWGLGetDevice_v2020)(CUdevice_v1 *pDevice, HGPUNV hGpu);
+#endif
+typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v6050)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
+typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v3020)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
+typedef CUresult (CUDAAPI *PFN_cuGLInit_v2000)(void);
+typedef CUresult (CUDAAPI *PFN_cuGLRegisterBufferObject_v2000)(GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v7000_ptds)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObject_v2000)(GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLUnregisterBufferObject_v2000)(GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLSetBufferObjectMapFlags_v2030)(GLuint buffer, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v7000_ptsz)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObjectAsync_v2030)(GLuint buffer, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
+
+/*
+ * Type definitions for older versioned functions in cudaGL.h
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v4010)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v2000)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v2030)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v2000)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
+#endif
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaTypedefs.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaTypedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..18fd7a559a81e59e4533ed175b1815209897e9ea
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaTypedefs.h
@@ -0,0 +1,1102 @@
+/*
+ * Copyright 2020-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDATYPEDEFS_H
+#define CUDATYPEDEFS_H
+
+#include <cuda.h>
+
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) // chooses which versioned typedef name the PFN_* aliases below paste together
+    #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds // expands to <api>_v<ptds_version>_ptds; default_version is unused
+    #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz // expands to <api>_v<ptds_version>_ptsz; default_version is unused
+#else // CUDA_API_PER_THREAD_DEFAULT_STREAM not defined
+    #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version // expands to <api>_v<default_version>; ptds_version is unused
+    #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version // identical expansion to __API_TYPEDEF_PTDS in this branch
+#endif // CUDA_API_PER_THREAD_DEFAULT_STREAM
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros for the latest version for each driver function in cuda.h
+ */
+#define PFN_cuGetErrorString  PFN_cuGetErrorString_v6000
+#define PFN_cuGetErrorName  PFN_cuGetErrorName_v6000
+#define PFN_cuInit  PFN_cuInit_v2000
+#define PFN_cuDriverGetVersion  PFN_cuDriverGetVersion_v2020
+#define PFN_cuDeviceGet  PFN_cuDeviceGet_v2000
+#define PFN_cuDeviceGetCount  PFN_cuDeviceGetCount_v2000
+#define PFN_cuDeviceGetName  PFN_cuDeviceGetName_v2000
+#define PFN_cuDeviceGetUuid  PFN_cuDeviceGetUuid_v11040
+#define PFN_cuDeviceGetLuid  PFN_cuDeviceGetLuid_v10000
+#define PFN_cuDeviceTotalMem  PFN_cuDeviceTotalMem_v3020
+#define PFN_cuDeviceGetTexture1DLinearMaxWidth  PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010
+#define PFN_cuDeviceGetAttribute  PFN_cuDeviceGetAttribute_v2000
+#define PFN_cuDeviceGetNvSciSyncAttributes  PFN_cuDeviceGetNvSciSyncAttributes_v10020
+#define PFN_cuDeviceSetMemPool  PFN_cuDeviceSetMemPool_v11020
+#define PFN_cuDeviceGetMemPool  PFN_cuDeviceGetMemPool_v11020
+#define PFN_cuDeviceGetDefaultMemPool  PFN_cuDeviceGetDefaultMemPool_v11020
+#define PFN_cuDeviceGetProperties  PFN_cuDeviceGetProperties_v2000
+#define PFN_cuDeviceComputeCapability  PFN_cuDeviceComputeCapability_v2000
+#define PFN_cuDevicePrimaryCtxRetain  PFN_cuDevicePrimaryCtxRetain_v7000
+#define PFN_cuDevicePrimaryCtxRelease  PFN_cuDevicePrimaryCtxRelease_v11000
+#define PFN_cuDevicePrimaryCtxSetFlags  PFN_cuDevicePrimaryCtxSetFlags_v11000
+#define PFN_cuDevicePrimaryCtxGetState  PFN_cuDevicePrimaryCtxGetState_v7000
+#define PFN_cuDevicePrimaryCtxReset  PFN_cuDevicePrimaryCtxReset_v11000
+#define PFN_cuDeviceGetExecAffinitySupport  PFN_cuDeviceGetExecAffinitySupport_v11040
+#define PFN_cuCtxCreate  PFN_cuCtxCreate_v11040
+#define PFN_cuCtxGetId  PFN_cuCtxGetId_v12000
+#define PFN_cuCtxDestroy  PFN_cuCtxDestroy_v4000
+#define PFN_cuCtxPushCurrent  PFN_cuCtxPushCurrent_v4000
+#define PFN_cuCtxPopCurrent  PFN_cuCtxPopCurrent_v4000
+#define PFN_cuCtxSetCurrent  PFN_cuCtxSetCurrent_v4000
+#define PFN_cuCtxGetCurrent  PFN_cuCtxGetCurrent_v4000
+#define PFN_cuCtxGetDevice  PFN_cuCtxGetDevice_v2000
+#define PFN_cuCtxGetFlags  PFN_cuCtxGetFlags_v7000
+#define PFN_cuCtxSetFlags  PFN_cuCtxSetFlags_v12010
+#define PFN_cuCtxSynchronize  PFN_cuCtxSynchronize_v2000
+#define PFN_cuCtxSetLimit  PFN_cuCtxSetLimit_v3010
+#define PFN_cuCtxGetLimit  PFN_cuCtxGetLimit_v3010
+#define PFN_cuCtxGetCacheConfig  PFN_cuCtxGetCacheConfig_v3020
+#define PFN_cuCtxSetCacheConfig  PFN_cuCtxSetCacheConfig_v3020
+#define PFN_cuCtxGetSharedMemConfig  PFN_cuCtxGetSharedMemConfig_v4020
+#define PFN_cuCtxSetSharedMemConfig  PFN_cuCtxSetSharedMemConfig_v4020
+#define PFN_cuCtxGetApiVersion  PFN_cuCtxGetApiVersion_v3020
+#define PFN_cuCtxGetStreamPriorityRange  PFN_cuCtxGetStreamPriorityRange_v5050
+#define PFN_cuCtxResetPersistingL2Cache  PFN_cuCtxResetPersistingL2Cache_v11000
+#define PFN_cuCtxAttach  PFN_cuCtxAttach_v2000
+#define PFN_cuCtxDetach  PFN_cuCtxDetach_v2000
+#define PFN_cuCtxGetExecAffinity  PFN_cuCtxGetExecAffinity_v11040
+#define PFN_cuModuleLoad  PFN_cuModuleLoad_v2000
+#define PFN_cuModuleLoadData  PFN_cuModuleLoadData_v2000
+#define PFN_cuModuleLoadDataEx  PFN_cuModuleLoadDataEx_v2010
+#define PFN_cuModuleLoadFatBinary  PFN_cuModuleLoadFatBinary_v2000
+#define PFN_cuModuleUnload  PFN_cuModuleUnload_v2000
+#define PFN_cuModuleGetFunction  PFN_cuModuleGetFunction_v2000
+#define PFN_cuModuleGetGlobal  PFN_cuModuleGetGlobal_v3020
+#define PFN_cuModuleGetTexRef  PFN_cuModuleGetTexRef_v2000
+#define PFN_cuModuleGetSurfRef  PFN_cuModuleGetSurfRef_v3000
+#define PFN_cuModuleGetFunctionCount PFN_cuModuleGetFunctionCount_v12040
+#define PFN_cuModuleEnumerateFunctions PFN_cuModuleEnumerateFunctions_v12040
+#define PFN_cuLinkCreate  PFN_cuLinkCreate_v6050
+#define PFN_cuLinkAddData  PFN_cuLinkAddData_v6050
+#define PFN_cuLinkAddFile  PFN_cuLinkAddFile_v6050
+#define PFN_cuLinkComplete  PFN_cuLinkComplete_v5050
+#define PFN_cuLinkDestroy  PFN_cuLinkDestroy_v5050
+#define PFN_cuMemGetInfo  PFN_cuMemGetInfo_v3020
+#define PFN_cuMemAlloc  PFN_cuMemAlloc_v3020
+#define PFN_cuMemAllocPitch  PFN_cuMemAllocPitch_v3020
+#define PFN_cuMemFree  PFN_cuMemFree_v3020
+#define PFN_cuMemGetAddressRange  PFN_cuMemGetAddressRange_v3020
+#define PFN_cuMemAllocHost  PFN_cuMemAllocHost_v3020
+#define PFN_cuMemFreeHost  PFN_cuMemFreeHost_v2000
+#define PFN_cuMemHostAlloc  PFN_cuMemHostAlloc_v2020
+#define PFN_cuMemHostGetDevicePointer  PFN_cuMemHostGetDevicePointer_v3020
+#define PFN_cuMemHostGetFlags  PFN_cuMemHostGetFlags_v2030
+#define PFN_cuMemAllocManaged  PFN_cuMemAllocManaged_v6000
+#define PFN_cuDeviceGetByPCIBusId  PFN_cuDeviceGetByPCIBusId_v4010
+#define PFN_cuDeviceGetPCIBusId  PFN_cuDeviceGetPCIBusId_v4010
+#define PFN_cuIpcGetEventHandle  PFN_cuIpcGetEventHandle_v4010
+#define PFN_cuIpcOpenEventHandle  PFN_cuIpcOpenEventHandle_v4010
+#define PFN_cuIpcGetMemHandle  PFN_cuIpcGetMemHandle_v4010
+#define PFN_cuIpcOpenMemHandle  PFN_cuIpcOpenMemHandle_v11000
+#define PFN_cuIpcCloseMemHandle  PFN_cuIpcCloseMemHandle_v4010
+#define PFN_cuMemHostRegister  PFN_cuMemHostRegister_v6050
+#define PFN_cuMemHostUnregister  PFN_cuMemHostUnregister_v4000
+#define PFN_cuMemcpy  __API_TYPEDEF_PTDS(PFN_cuMemcpy, 4000, 7000)
+#define PFN_cuMemcpyPeer  __API_TYPEDEF_PTDS(PFN_cuMemcpyPeer, 4000, 7000)
+#define PFN_cuMemcpyHtoD  __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoD, 3020, 7000)
+#define PFN_cuMemcpyDtoH  __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoH, 3020, 7000)
+#define PFN_cuMemcpyDtoD  __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoD, 3020, 7000)
+#define PFN_cuMemcpyDtoA  __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoA, 3020, 7000)
+#define PFN_cuMemcpyAtoD  __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoD, 3020, 7000)
+#define PFN_cuMemcpyHtoA  __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoA, 3020, 7000)
+#define PFN_cuMemcpyAtoH  __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoH, 3020, 7000)
+#define PFN_cuMemcpyAtoA  __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoA, 3020, 7000)
+#define PFN_cuMemcpy2D  __API_TYPEDEF_PTDS(PFN_cuMemcpy2D, 3020, 7000)
+#define PFN_cuMemcpy2DUnaligned  __API_TYPEDEF_PTDS(PFN_cuMemcpy2DUnaligned, 3020, 7000)
+#define PFN_cuMemcpy3D  __API_TYPEDEF_PTDS(PFN_cuMemcpy3D, 3020, 7000)
+#define PFN_cuMemcpy3DPeer  __API_TYPEDEF_PTDS(PFN_cuMemcpy3DPeer, 4000, 7000)
+#define PFN_cuMemcpyAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyAsync, 4000, 7000)
+#define PFN_cuMemcpyPeerAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyPeerAsync, 4000, 7000)
+#define PFN_cuMemcpyHtoDAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoDAsync, 3020, 7000)
+#define PFN_cuMemcpyDtoHAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoHAsync, 3020, 7000)
+#define PFN_cuMemcpyDtoDAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoDAsync, 3020, 7000)
+#define PFN_cuMemcpyHtoAAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoAAsync, 3020, 7000)
+#define PFN_cuMemcpyAtoHAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyAtoHAsync, 3020, 7000)
+#define PFN_cuMemcpy2DAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpy2DAsync, 3020, 7000)
+#define PFN_cuMemcpy3DAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DAsync, 3020, 7000)
+#define PFN_cuMemcpy3DPeerAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DPeerAsync, 4000, 7000)
+#define PFN_cuMemsetD8  __API_TYPEDEF_PTDS(PFN_cuMemsetD8, 3020, 7000)
+#define PFN_cuMemsetD16  __API_TYPEDEF_PTDS(PFN_cuMemsetD16, 3020, 7000)
+#define PFN_cuMemsetD32  __API_TYPEDEF_PTDS(PFN_cuMemsetD32, 3020, 7000)
+#define PFN_cuMemsetD2D8  __API_TYPEDEF_PTDS(PFN_cuMemsetD2D8, 3020, 7000)
+#define PFN_cuMemsetD2D16  __API_TYPEDEF_PTDS(PFN_cuMemsetD2D16, 3020, 7000)
+#define PFN_cuMemsetD2D32  __API_TYPEDEF_PTDS(PFN_cuMemsetD2D32, 3020, 7000)
+#define PFN_cuMemsetD8Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD8Async, 3020, 7000)
+#define PFN_cuMemsetD16Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD16Async, 3020, 7000)
+#define PFN_cuMemsetD32Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD32Async, 3020, 7000)
+#define PFN_cuMemsetD2D8Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D8Async, 3020, 7000)
+#define PFN_cuMemsetD2D16Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D16Async, 3020, 7000)
+#define PFN_cuMemsetD2D32Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D32Async, 3020, 7000)
+#define PFN_cuArrayCreate  PFN_cuArrayCreate_v3020
+#define PFN_cuArrayGetDescriptor  PFN_cuArrayGetDescriptor_v3020
+#define PFN_cuArrayGetSparseProperties  PFN_cuArrayGetSparseProperties_v11010
+#define PFN_cuMipmappedArrayGetSparseProperties  PFN_cuMipmappedArrayGetSparseProperties_v11010
+#define PFN_cuArrayGetMemoryRequirements  PFN_cuArrayGetMemoryRequirements_v11060
+#define PFN_cuMipmappedArrayGetMemoryRequirements  PFN_cuMipmappedArrayGetMemoryRequirements_v11060
+#define PFN_cuArrayGetPlane  PFN_cuArrayGetPlane_v11020
+#define PFN_cuArrayDestroy  PFN_cuArrayDestroy_v2000
+#define PFN_cuArray3DCreate  PFN_cuArray3DCreate_v3020
+#define PFN_cuArray3DGetDescriptor  PFN_cuArray3DGetDescriptor_v3020
+#define PFN_cuMipmappedArrayCreate  PFN_cuMipmappedArrayCreate_v5000
+#define PFN_cuMipmappedArrayGetLevel  PFN_cuMipmappedArrayGetLevel_v5000
+#define PFN_cuMipmappedArrayDestroy  PFN_cuMipmappedArrayDestroy_v5000
+#define PFN_cuMemAddressReserve  PFN_cuMemAddressReserve_v10020
+#define PFN_cuMemAddressFree  PFN_cuMemAddressFree_v10020
+#define PFN_cuMemCreate  PFN_cuMemCreate_v10020
+#define PFN_cuMemRelease  PFN_cuMemRelease_v10020
+#define PFN_cuMemMap  PFN_cuMemMap_v10020
+#define PFN_cuMemMapArrayAsync  __API_TYPEDEF_PTSZ(PFN_cuMemMapArrayAsync, 11010, 11010)
+#define PFN_cuMemUnmap  PFN_cuMemUnmap_v10020
+#define PFN_cuMemSetAccess  PFN_cuMemSetAccess_v10020
+#define PFN_cuMemGetAccess  PFN_cuMemGetAccess_v10020
+#define PFN_cuMemExportToShareableHandle  PFN_cuMemExportToShareableHandle_v10020
+#define PFN_cuMemImportFromShareableHandle  PFN_cuMemImportFromShareableHandle_v10020
+#define PFN_cuMemGetAllocationGranularity  PFN_cuMemGetAllocationGranularity_v10020
+#define PFN_cuMemGetAllocationPropertiesFromHandle  PFN_cuMemGetAllocationPropertiesFromHandle_v10020
+#define PFN_cuMemRetainAllocationHandle  PFN_cuMemRetainAllocationHandle_v11000
+#define PFN_cuMemFreeAsync  __API_TYPEDEF_PTSZ(PFN_cuMemFreeAsync, 11020, 11020)
+#define PFN_cuMemAllocAsync  __API_TYPEDEF_PTSZ(PFN_cuMemAllocAsync, 11020, 11020)
+#define PFN_cuMemPoolTrimTo  PFN_cuMemPoolTrimTo_v11020
+#define PFN_cuMemPoolSetAttribute  PFN_cuMemPoolSetAttribute_v11020
+#define PFN_cuMemPoolGetAttribute  PFN_cuMemPoolGetAttribute_v11020
+#define PFN_cuMemPoolSetAccess  PFN_cuMemPoolSetAccess_v11020
+#define PFN_cuMemPoolGetAccess  PFN_cuMemPoolGetAccess_v11020
+#define PFN_cuMemPoolCreate  PFN_cuMemPoolCreate_v11020
+#define PFN_cuMemPoolDestroy  PFN_cuMemPoolDestroy_v11020
+#define PFN_cuMemAllocFromPoolAsync  __API_TYPEDEF_PTSZ(PFN_cuMemAllocFromPoolAsync, 11020, 11020)
+#define PFN_cuMemPoolExportToShareableHandle  PFN_cuMemPoolExportToShareableHandle_v11020
+#define PFN_cuMemPoolImportFromShareableHandle  PFN_cuMemPoolImportFromShareableHandle_v11020
+#define PFN_cuMemPoolExportPointer  PFN_cuMemPoolExportPointer_v11020
+#define PFN_cuMemPoolImportPointer  PFN_cuMemPoolImportPointer_v11020
+#define PFN_cuPointerGetAttribute  PFN_cuPointerGetAttribute_v4000
+#define PFN_cuMemPrefetchAsync  __API_TYPEDEF_PTSZ(PFN_cuMemPrefetchAsync, 8000, 8000)
+#define PFN_cuMemAdvise  PFN_cuMemAdvise_v8000
+#define PFN_cuMemAdvise_v2  PFN_cuMemAdvise_v12020
+#define PFN_cuMemPrefetchAsync_v2  __API_TYPEDEF_PTSZ(PFN_cuMemPrefetchAsync, 12020, 12020)
+#define PFN_cuMemRangeGetAttribute  PFN_cuMemRangeGetAttribute_v8000
+#define PFN_cuMemRangeGetAttributes  PFN_cuMemRangeGetAttributes_v8000
+#define PFN_cuMulticastCreate  PFN_cuMulticastCreate_v12010
+#define PFN_cuMulticastAddDevice  PFN_cuMulticastAddDevice_v12010
+#define PFN_cuMulticastBindMem  PFN_cuMulticastBindMem_v12010
+#define PFN_cuMulticastBindAddr  PFN_cuMulticastBindAddr_v12010
+#define PFN_cuMulticastUnbind  PFN_cuMulticastUnbind_v12010
+#define PFN_cuMulticastGetGranularity  PFN_cuMulticastGetGranularity_v12010
+#define PFN_cuPointerSetAttribute  PFN_cuPointerSetAttribute_v6000
+#define PFN_cuPointerGetAttributes  PFN_cuPointerGetAttributes_v7000
+#define PFN_cuStreamCreate  PFN_cuStreamCreate_v2000
+#define PFN_cuStreamCreateWithPriority  PFN_cuStreamCreateWithPriority_v5050
+#define PFN_cuStreamGetId  __API_TYPEDEF_PTSZ(PFN_cuStreamGetId, 12000, 12000) // pass the unversioned name: the macro appends _v<version> (and _ptsz) itself
+#define PFN_cuStreamGetPriority  __API_TYPEDEF_PTSZ(PFN_cuStreamGetPriority, 5050, 7000)
+#define PFN_cuStreamGetFlags  __API_TYPEDEF_PTSZ(PFN_cuStreamGetFlags, 5050, 7000)
+#define PFN_cuStreamGetCtx  __API_TYPEDEF_PTSZ(PFN_cuStreamGetCtx, 9020, 9020)
+#define PFN_cuStreamWaitEvent  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitEvent, 3020, 7000)
+#define PFN_cuStreamAddCallback  __API_TYPEDEF_PTSZ(PFN_cuStreamAddCallback, 5000, 7000)
+#define PFN_cuStreamBeginCapture  __API_TYPEDEF_PTSZ(PFN_cuStreamBeginCapture, 10010, 10010)
+#define PFN_cuStreamBeginCaptureToGraph  __API_TYPEDEF_PTSZ(PFN_cuStreamBeginCaptureToGraph, 12030, 12030)
+#define PFN_cuThreadExchangeStreamCaptureMode  PFN_cuThreadExchangeStreamCaptureMode_v10010
+#define PFN_cuStreamEndCapture  __API_TYPEDEF_PTSZ(PFN_cuStreamEndCapture, 10000, 10000)
+#define PFN_cuStreamIsCapturing  __API_TYPEDEF_PTSZ(PFN_cuStreamIsCapturing, 10000, 10000)
+#define PFN_cuStreamGetCaptureInfo  __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 10010, 10010)
+#define PFN_cuStreamGetCaptureInfo_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 11030, 11030)
+#define PFN_cuStreamGetCaptureInfo_v3  __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 12030, 12030)
+#define PFN_cuStreamUpdateCaptureDependencies  __API_TYPEDEF_PTSZ(PFN_cuStreamUpdateCaptureDependencies, 11030, 11030)
+#define PFN_cuStreamUpdateCaptureDependencies_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamUpdateCaptureDependencies, 12030, 12030)
+#define PFN_cuStreamAttachMemAsync  __API_TYPEDEF_PTSZ(PFN_cuStreamAttachMemAsync, 6000, 7000)
+#define PFN_cuStreamQuery  __API_TYPEDEF_PTSZ(PFN_cuStreamQuery, 2000, 7000)
+#define PFN_cuStreamSynchronize  __API_TYPEDEF_PTSZ(PFN_cuStreamSynchronize, 2000, 7000)
+#define PFN_cuStreamDestroy  PFN_cuStreamDestroy_v4000
+#define PFN_cuStreamCopyAttributes  __API_TYPEDEF_PTSZ(PFN_cuStreamCopyAttributes, 11000, 11000)
+#define PFN_cuStreamGetAttribute  __API_TYPEDEF_PTSZ(PFN_cuStreamGetAttribute, 11000, 11000)
+#define PFN_cuStreamSetAttribute  __API_TYPEDEF_PTSZ(PFN_cuStreamSetAttribute, 11000, 11000)
+#define PFN_cuEventCreate  PFN_cuEventCreate_v2000
+#define PFN_cuEventRecord  __API_TYPEDEF_PTSZ(PFN_cuEventRecord, 2000, 7000)
+#define PFN_cuEventRecordWithFlags  __API_TYPEDEF_PTSZ(PFN_cuEventRecordWithFlags, 11010, 11010)
+#define PFN_cuEventQuery  PFN_cuEventQuery_v2000
+#define PFN_cuEventSynchronize  PFN_cuEventSynchronize_v2000
+#define PFN_cuEventDestroy  PFN_cuEventDestroy_v4000
+#define PFN_cuEventElapsedTime  PFN_cuEventElapsedTime_v2000
+#define PFN_cuImportExternalMemory  PFN_cuImportExternalMemory_v10000
+#define PFN_cuExternalMemoryGetMappedBuffer  PFN_cuExternalMemoryGetMappedBuffer_v10000
+#define PFN_cuExternalMemoryGetMappedMipmappedArray  PFN_cuExternalMemoryGetMappedMipmappedArray_v10000
+#define PFN_cuDestroyExternalMemory  PFN_cuDestroyExternalMemory_v10000
+#define PFN_cuImportExternalSemaphore  PFN_cuImportExternalSemaphore_v10000
+#define PFN_cuSignalExternalSemaphoresAsync  __API_TYPEDEF_PTSZ(PFN_cuSignalExternalSemaphoresAsync, 10000, 10000)
+#define PFN_cuWaitExternalSemaphoresAsync  __API_TYPEDEF_PTSZ(PFN_cuWaitExternalSemaphoresAsync, 10000, 10000)
+#define PFN_cuDestroyExternalSemaphore  PFN_cuDestroyExternalSemaphore_v10000
+#define PFN_cuStreamWaitValue32  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 8000, 8000)
+#define PFN_cuStreamWaitValue64  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 9000, 9000)
+#define PFN_cuStreamWriteValue32  __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 8000, 8000)
+#define PFN_cuStreamWriteValue64  __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 9000, 9000)
+#define PFN_cuStreamBatchMemOp  __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 8000, 8000)
+#define PFN_cuStreamWaitValue32_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 11070, 11070)
+#define PFN_cuStreamWaitValue64_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 11070, 11070)
+#define PFN_cuStreamWriteValue32_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 11070, 11070)
+#define PFN_cuStreamWriteValue64_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 11070, 11070)
+#define PFN_cuStreamBatchMemOp_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 11070, 11070)
+#define PFN_cuFuncGetAttribute  PFN_cuFuncGetAttribute_v2020
+#define PFN_cuFuncSetAttribute  PFN_cuFuncSetAttribute_v9000
+#define PFN_cuFuncSetCacheConfig  PFN_cuFuncSetCacheConfig_v3000
+#define PFN_cuFuncSetSharedMemConfig  PFN_cuFuncSetSharedMemConfig_v4020
+#define PFN_cuFuncGetName  PFN_cuFuncGetName_v12030
+#define PFN_cuFuncGetParamInfo  PFN_cuFuncGetParamInfo_v12040
+#define PFN_cuFuncIsLoaded PFN_cuFuncIsLoaded_v12040
+#define PFN_cuFuncLoad PFN_cuFuncLoad_v12040
+#define PFN_cuLaunchKernel  __API_TYPEDEF_PTSZ(PFN_cuLaunchKernel, 4000, 7000)
+#define PFN_cuLaunchKernelEx __API_TYPEDEF_PTSZ(PFN_cuLaunchKernelEx, 11060, 11060)
+#define PFN_cuLaunchCooperativeKernel  __API_TYPEDEF_PTSZ(PFN_cuLaunchCooperativeKernel, 9000, 9000)
+#define PFN_cuLaunchCooperativeKernelMultiDevice  PFN_cuLaunchCooperativeKernelMultiDevice_v9000
+#define PFN_cuLaunchHostFunc  __API_TYPEDEF_PTSZ(PFN_cuLaunchHostFunc, 10000, 10000)
+#define PFN_cuFuncSetBlockShape  PFN_cuFuncSetBlockShape_v2000
+#define PFN_cuFuncSetSharedSize  PFN_cuFuncSetSharedSize_v2000
+#define PFN_cuParamSetSize  PFN_cuParamSetSize_v2000
+#define PFN_cuParamSeti  PFN_cuParamSeti_v2000
+#define PFN_cuParamSetf  PFN_cuParamSetf_v2000
+#define PFN_cuParamSetv  PFN_cuParamSetv_v2000
+#define PFN_cuLaunch  PFN_cuLaunch_v2000
+#define PFN_cuLaunchGrid  PFN_cuLaunchGrid_v2000
+#define PFN_cuLaunchGridAsync  PFN_cuLaunchGridAsync_v2000
+#define PFN_cuParamSetTexRef  PFN_cuParamSetTexRef_v2000
+#define PFN_cuGraphCreate  PFN_cuGraphCreate_v10000
+#define PFN_cuGraphAddKernelNode  PFN_cuGraphAddKernelNode_v12000
+#define PFN_cuGraphKernelNodeGetParams  PFN_cuGraphKernelNodeGetParams_v12000
+#define PFN_cuGraphKernelNodeSetParams  PFN_cuGraphKernelNodeSetParams_v12000
+#define PFN_cuGraphAddMemcpyNode  PFN_cuGraphAddMemcpyNode_v10000
+#define PFN_cuGraphMemcpyNodeGetParams  PFN_cuGraphMemcpyNodeGetParams_v10000
+#define PFN_cuGraphMemcpyNodeSetParams  PFN_cuGraphMemcpyNodeSetParams_v10000
+#define PFN_cuGraphAddMemsetNode  PFN_cuGraphAddMemsetNode_v10000
+#define PFN_cuGraphMemsetNodeGetParams  PFN_cuGraphMemsetNodeGetParams_v10000
+#define PFN_cuGraphMemsetNodeSetParams  PFN_cuGraphMemsetNodeSetParams_v10000
+#define PFN_cuGraphAddHostNode  PFN_cuGraphAddHostNode_v10000
+#define PFN_cuGraphHostNodeGetParams  PFN_cuGraphHostNodeGetParams_v10000
+#define PFN_cuGraphHostNodeSetParams  PFN_cuGraphHostNodeSetParams_v10000
+#define PFN_cuGraphAddChildGraphNode  PFN_cuGraphAddChildGraphNode_v10000
+#define PFN_cuGraphChildGraphNodeGetGraph  PFN_cuGraphChildGraphNodeGetGraph_v10000
+#define PFN_cuGraphAddEmptyNode  PFN_cuGraphAddEmptyNode_v10000
+#define PFN_cuGraphAddEventRecordNode  PFN_cuGraphAddEventRecordNode_v11010
+#define PFN_cuGraphEventRecordNodeGetEvent  PFN_cuGraphEventRecordNodeGetEvent_v11010
+#define PFN_cuGraphEventRecordNodeSetEvent  PFN_cuGraphEventRecordNodeSetEvent_v11010
+#define PFN_cuGraphAddEventWaitNode  PFN_cuGraphAddEventWaitNode_v11010
+#define PFN_cuGraphEventWaitNodeGetEvent  PFN_cuGraphEventWaitNodeGetEvent_v11010
+#define PFN_cuGraphEventWaitNodeSetEvent  PFN_cuGraphEventWaitNodeSetEvent_v11010
+#define PFN_cuGraphAddExternalSemaphoresSignalNode  PFN_cuGraphAddExternalSemaphoresSignalNode_v11020
+#define PFN_cuGraphExternalSemaphoresSignalNodeGetParams  PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020
+#define PFN_cuGraphExternalSemaphoresSignalNodeSetParams  PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020
+#define PFN_cuGraphAddExternalSemaphoresWaitNode  PFN_cuGraphAddExternalSemaphoresWaitNode_v11020
+#define PFN_cuGraphExternalSemaphoresWaitNodeGetParams  PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020
+#define PFN_cuGraphExternalSemaphoresWaitNodeSetParams  PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020
+#define PFN_cuGraphAddBatchMemOpNode PFN_cuGraphAddBatchMemOpNode_v11070
+#define PFN_cuGraphBatchMemOpNodeGetParams PFN_cuGraphBatchMemOpNodeGetParams_v11070
+#define PFN_cuGraphBatchMemOpNodeSetParams PFN_cuGraphBatchMemOpNodeSetParams_v11070 // target must be a single token: <name>_v11070, matching the neighbouring BatchMemOp aliases
+#define PFN_cuGraphExecBatchMemOpNodeSetParams PFN_cuGraphExecBatchMemOpNodeSetParams_v11070
+#define PFN_cuGraphClone  PFN_cuGraphClone_v10000
+#define PFN_cuGraphNodeFindInClone  PFN_cuGraphNodeFindInClone_v10000
+#define PFN_cuGraphNodeGetType  PFN_cuGraphNodeGetType_v10000
+#define PFN_cuGraphGetNodes  PFN_cuGraphGetNodes_v10000
+#define PFN_cuGraphGetRootNodes  PFN_cuGraphGetRootNodes_v10000
+#define PFN_cuGraphGetEdges  PFN_cuGraphGetEdges_v12030
+#define PFN_cuGraphNodeGetDependencies  PFN_cuGraphNodeGetDependencies_v12030
+#define PFN_cuGraphNodeGetDependentNodes  PFN_cuGraphNodeGetDependentNodes_v12030
+#define PFN_cuGraphAddDependencies  PFN_cuGraphAddDependencies_v12030
+#define PFN_cuGraphRemoveDependencies  PFN_cuGraphRemoveDependencies_v12030
+#define PFN_cuGraphDestroyNode  PFN_cuGraphDestroyNode_v10000
+
+#define PFN_cuGraphInstantiate  PFN_cuGraphInstantiateWithFlags_v11040 // NOTE(review): resolves to the WithFlags typedef, not a PFN_cuGraphInstantiate_v* one - presumably intentional; confirm against cuda.h
+
+#define PFN_cuGraphInstantiateWithFlags  PFN_cuGraphInstantiateWithFlags_v11040
+#define PFN_cuGraphInstantiateWithParams  __API_TYPEDEF_PTSZ(PFN_cuGraphInstantiateWithParams, 12000, 12000)
+#define PFN_cuGraphExecGetFlags  PFN_cuGraphExecGetFlags_v12000
+#define PFN_cuGraphExecKernelNodeSetParams  PFN_cuGraphExecKernelNodeSetParams_v12000
+#define PFN_cuGraphExecMemcpyNodeSetParams  PFN_cuGraphExecMemcpyNodeSetParams_v10020
+#define PFN_cuGraphExecMemsetNodeSetParams  PFN_cuGraphExecMemsetNodeSetParams_v10020
+#define PFN_cuGraphExecHostNodeSetParams  PFN_cuGraphExecHostNodeSetParams_v10020
+#define PFN_cuGraphExecChildGraphNodeSetParams  PFN_cuGraphExecChildGraphNodeSetParams_v11010
+#define PFN_cuGraphExecEventRecordNodeSetEvent  PFN_cuGraphExecEventRecordNodeSetEvent_v11010
+#define PFN_cuGraphExecEventWaitNodeSetEvent  PFN_cuGraphExecEventWaitNodeSetEvent_v11010
+#define PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams  PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020
+#define PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams  PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020
+#define PFN_cuGraphUpload  __API_TYPEDEF_PTSZ(PFN_cuGraphUpload, 11010, 11010)
+#define PFN_cuGraphLaunch  __API_TYPEDEF_PTSZ(PFN_cuGraphLaunch, 10000, 10000)
+#define PFN_cuGraphExecDestroy  PFN_cuGraphExecDestroy_v10000
+#define PFN_cuGraphDestroy  PFN_cuGraphDestroy_v10000
+#define PFN_cuGraphExecUpdate  PFN_cuGraphExecUpdate_v12000
+#define PFN_cuGraphKernelNodeCopyAttributes  PFN_cuGraphKernelNodeCopyAttributes_v11000
+#define PFN_cuGraphKernelNodeGetAttribute  PFN_cuGraphKernelNodeGetAttribute_v11000
+#define PFN_cuGraphKernelNodeSetAttribute  PFN_cuGraphKernelNodeSetAttribute_v11000
+#define PFN_cuGraphDebugDotPrint  PFN_cuGraphDebugDotPrint_v11030
+#define PFN_cuGraphAddMemAllocNode  PFN_cuGraphAddMemAllocNode_v11040
+#define PFN_cuGraphMemAllocNodeGetParams PFN_cuGraphMemAllocNodeGetParams_v11040
+#define PFN_cuGraphAddMemFreeNode  PFN_cuGraphAddMemFreeNode_v11040
+#define PFN_cuGraphMemFreeNodeGetParams PFN_cuGraphMemFreeNodeGetParams_v11040
+#define PFN_cuGraphNodeSetEnabled PFN_cuGraphNodeSetEnabled_v11060
+#define PFN_cuGraphNodeGetEnabled PFN_cuGraphNodeGetEnabled_v11060
+#define PFN_cuGraphAddNode PFN_cuGraphAddNode_v12030
+#define PFN_cuGraphNodeSetParams PFN_cuGraphNodeSetParams_v12020
+#define PFN_cuGraphExecNodeSetParams PFN_cuGraphExecNodeSetParams_v12020
+#define PFN_cuGraphConditionalHandleCreate PFN_cuGraphConditionalHandleCreate_v12030
+#define PFN_GraphConditionalHandleCreate PFN_cuGraphConditionalHandleCreate_v12030 // misspelled alias (missing "cu" prefix) kept so existing users of the old name still compile
+#define PFN_cuDeviceGraphMemTrim  PFN_cuDeviceGraphMemTrim_v11040
+#define PFN_cuDeviceGetGraphMemAttribute  PFN_cuDeviceGetGraphMemAttribute_v11040
+#define PFN_cuDeviceSetGraphMemAttribute  PFN_cuDeviceSetGraphMemAttribute_v11040
+#define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor  PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050
+#define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags  PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000
+#define PFN_cuOccupancyMaxPotentialBlockSize  PFN_cuOccupancyMaxPotentialBlockSize_v6050
+#define PFN_cuOccupancyMaxPotentialBlockSizeWithFlags  PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000
+#define PFN_cuOccupancyAvailableDynamicSMemPerBlock  PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020
+#define PFN_cuOccupancyMaxPotentialClusterSize  PFN_cuOccupancyMaxPotentialClusterSize_v11070
+#define PFN_cuOccupancyMaxActiveClusters  PFN_cuOccupancyMaxActiveClusters_v11070
+#define PFN_cuTexRefSetArray  PFN_cuTexRefSetArray_v2000
+#define PFN_cuTexRefSetMipmappedArray  PFN_cuTexRefSetMipmappedArray_v5000
+#define PFN_cuTexRefSetAddress  PFN_cuTexRefSetAddress_v3020
+#define PFN_cuTexRefSetAddress2D  PFN_cuTexRefSetAddress2D_v4010
+#define PFN_cuTexRefSetFormat  PFN_cuTexRefSetFormat_v2000
+#define PFN_cuTexRefSetAddressMode  PFN_cuTexRefSetAddressMode_v2000
+#define PFN_cuTexRefSetFilterMode  PFN_cuTexRefSetFilterMode_v2000
+#define PFN_cuTexRefSetMipmapFilterMode  PFN_cuTexRefSetMipmapFilterMode_v5000
+#define PFN_cuTexRefSetMipmapLevelBias  PFN_cuTexRefSetMipmapLevelBias_v5000
+#define PFN_cuTexRefSetMipmapLevelClamp  PFN_cuTexRefSetMipmapLevelClamp_v5000
+#define PFN_cuTexRefSetMaxAnisotropy  PFN_cuTexRefSetMaxAnisotropy_v5000
+#define PFN_cuTexRefSetBorderColor  PFN_cuTexRefSetBorderColor_v8000
+#define PFN_cuTexRefSetFlags  PFN_cuTexRefSetFlags_v2000
+#define PFN_cuTexRefGetAddress  PFN_cuTexRefGetAddress_v3020
+#define PFN_cuTexRefGetArray  PFN_cuTexRefGetArray_v2000
+#define PFN_cuTexRefGetMipmappedArray  PFN_cuTexRefGetMipmappedArray_v5000
+#define PFN_cuTexRefGetAddressMode  PFN_cuTexRefGetAddressMode_v2000
+#define PFN_cuTexRefGetFilterMode  PFN_cuTexRefGetFilterMode_v2000
+#define PFN_cuTexRefGetFormat  PFN_cuTexRefGetFormat_v2000
+#define PFN_cuTexRefGetMipmapFilterMode  PFN_cuTexRefGetMipmapFilterMode_v5000
+#define PFN_cuTexRefGetMipmapLevelBias  PFN_cuTexRefGetMipmapLevelBias_v5000
+#define PFN_cuTexRefGetMipmapLevelClamp  PFN_cuTexRefGetMipmapLevelClamp_v5000
+#define PFN_cuTexRefGetMaxAnisotropy  PFN_cuTexRefGetMaxAnisotropy_v5000
+#define PFN_cuTexRefGetBorderColor  PFN_cuTexRefGetBorderColor_v8000
+#define PFN_cuTexRefGetFlags  PFN_cuTexRefGetFlags_v2000
+#define PFN_cuTexRefCreate  PFN_cuTexRefCreate_v2000
+#define PFN_cuTexRefDestroy  PFN_cuTexRefDestroy_v2000
+#define PFN_cuSurfRefSetArray  PFN_cuSurfRefSetArray_v3000
+#define PFN_cuSurfRefGetArray  PFN_cuSurfRefGetArray_v3000
+#define PFN_cuTexObjectCreate  PFN_cuTexObjectCreate_v5000
+#define PFN_cuTexObjectDestroy  PFN_cuTexObjectDestroy_v5000
+#define PFN_cuTexObjectGetResourceDesc  PFN_cuTexObjectGetResourceDesc_v5000
+#define PFN_cuTexObjectGetTextureDesc  PFN_cuTexObjectGetTextureDesc_v5000
+#define PFN_cuTexObjectGetResourceViewDesc  PFN_cuTexObjectGetResourceViewDesc_v5000
+#define PFN_cuSurfObjectCreate  PFN_cuSurfObjectCreate_v5000
+#define PFN_cuSurfObjectDestroy  PFN_cuSurfObjectDestroy_v5000
+#define PFN_cuSurfObjectGetResourceDesc  PFN_cuSurfObjectGetResourceDesc_v5000
+#define PFN_cuTensorMapEncodeTiled  PFN_cuTensorMapEncodeTiled_v12000
+#define PFN_cuTensorMapEncodeIm2col  PFN_cuTensorMapEncodeIm2col_v12000
+#define PFN_cuTensorMapReplaceAddress  PFN_cuTensorMapReplaceAddress_v12000
+#define PFN_cuDeviceCanAccessPeer  PFN_cuDeviceCanAccessPeer_v4000
+#define PFN_cuCtxEnablePeerAccess  PFN_cuCtxEnablePeerAccess_v4000
+#define PFN_cuCtxDisablePeerAccess  PFN_cuCtxDisablePeerAccess_v4000
+#define PFN_cuDeviceGetP2PAttribute  PFN_cuDeviceGetP2PAttribute_v8000
+#define PFN_cuGraphicsUnregisterResource  PFN_cuGraphicsUnregisterResource_v3000
+#define PFN_cuGraphicsSubResourceGetMappedArray  PFN_cuGraphicsSubResourceGetMappedArray_v3000
+#define PFN_cuGraphicsResourceGetMappedMipmappedArray  PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000
+#define PFN_cuGraphicsResourceGetMappedPointer  PFN_cuGraphicsResourceGetMappedPointer_v3020
+#define PFN_cuGraphicsResourceSetMapFlags  PFN_cuGraphicsResourceSetMapFlags_v6050
+#define PFN_cuGraphicsMapResources  __API_TYPEDEF_PTSZ(PFN_cuGraphicsMapResources, 3000, 7000)
+#define PFN_cuGraphicsUnmapResources  __API_TYPEDEF_PTSZ(PFN_cuGraphicsUnmapResources, 3000, 7000)
+#define PFN_cuGetExportTable  PFN_cuGetExportTable_v3000
+#define PFN_cuFuncGetModule  PFN_cuFuncGetModule_v11000
+#define PFN_cuFlushGPUDirectRDMAWrites PFN_cuFlushGPUDirectRDMAWrites_v11030
+#define PFN_cuGetProcAddress  PFN_cuGetProcAddress_v12000
+#define PFN_cuUserObjectCreate  PFN_cuUserObjectCreate_v11030
+#define PFN_cuUserObjectRetain  PFN_cuUserObjectRetain_v11030
+#define PFN_cuUserObjectRelease  PFN_cuUserObjectRelease_v11030
+#define PFN_cuGraphRetainUserObject  PFN_cuGraphRetainUserObject_v11030
+#define PFN_cuGraphReleaseUserObject  PFN_cuGraphReleaseUserObject_v11030
+#define PFN_cuModuleGetLoadingMode  PFN_cuModuleGetLoadingMode_v11070
+#define PFN_cuMemGetHandleForAddressRange  PFN_cuMemGetHandleForAddressRange_v11070
+#define PFN_cuLibraryLoadData PFN_cuLibraryLoadData_v12000
+#define PFN_cuLibraryLoadFromFile PFN_cuLibraryLoadFromFile_v12000
+#define PFN_cuLibraryUnload PFN_cuLibraryUnload_v12000
+#define PFN_cuLibraryGetKernel PFN_cuLibraryGetKernel_v12000
+#define PFN_cuLibraryGetModule PFN_cuLibraryGetModule_v12000
+#define PFN_cuKernelGetFunction PFN_cuKernelGetFunction_v12000
+#define PFN_cuLibraryGetGlobal PFN_cuLibraryGetGlobal_v12000
+#define PFN_cuLibraryGetManaged PFN_cuLibraryGetManaged_v12000
+#define PFN_cuLibraryGetKernelCount PFN_cuLibraryGetKernelCount_v12040
+#define PFN_cuLibraryEnumerateKernels PFN_cuLibraryEnumerateKernels_v12040
+#define PFN_cuKernelGetAttribute PFN_cuKernelGetAttribute_v12000
+#define PFN_cuKernelSetAttribute PFN_cuKernelSetAttribute_v12000
+#define PFN_cuKernelSetCacheConfig PFN_cuKernelSetCacheConfig_v12000
+#define PFN_cuKernelGetName  PFN_cuKernelGetName_v12030
+#define PFN_cuKernelGetParamInfo  PFN_cuKernelGetParamInfo_v12040
+#define PFN_cuLibraryGetUnifiedFunction PFN_cuLibraryGetUnifiedFunction_v12000
+#define PFN_cuCoredumpGetAttribute PFN_cuCoredumpGetAttribute_v12010
+#define PFN_cuCoredumpGetAttributeGlobal PFN_cuCoredumpGetAttributeGlobal_v12010
+#define PFN_cuCoredumpSetAttribute PFN_cuCoredumpSetAttribute_v12010
+#define PFN_cuCoredumpSetAttributeGlobal PFN_cuCoredumpSetAttributeGlobal_v12010
+#define PFN_cuDeviceRegisterAsyncNotification PFN_cuDeviceRegisterAsyncNotification_v12040
+#define PFN_cuDeviceUnregisterAsyncNotification PFN_cuDeviceUnregisterAsyncNotification_v12040
+#define PFN_cuGreenCtxCreate PFN_cuGreenCtxCreate_v12040
+#define PFN_cuGreenCtxDestroy PFN_cuGreenCtxDestroy_v12040
+#define PFN_cuDeviceGetDevResource PFN_cuDeviceGetDevResource_v12040
+#define PFN_cuCtxGetDevResource PFN_cuCtxGetDevResource_v12040
+#define PFN_cuGreenCtxGetDevResource PFN_cuGreenCtxGetDevResource_v12040
+#define PFN_cuGreenCtxRecordEvent PFN_cuGreenCtxRecordEvent_v12040
+#define PFN_cuGreenCtxWaitEvent PFN_cuGreenCtxWaitEvent_v12040
+#define PFN_cuDevResourceGenerateDesc PFN_cuDevResourceGenerateDesc_v12040
+#define PFN_cuDevSmResourceSplitByCount PFN_cuDevSmResourceSplitByCount_v12040
+#define PFN_cuStreamGetGreenCtx PFN_cuStreamGetGreenCtx_v12040
+#define PFN_cuCtxFromGreenCtx PFN_cuCtxFromGreenCtx_v12040
+
+/*
+ * Function-pointer type definitions for the functions declared in cuda.h
+ */
+typedef CUresult (CUDAAPI *PFN_cuGetErrorString_v6000)(CUresult error, const char **pStr);
+typedef CUresult (CUDAAPI *PFN_cuGetErrorName_v6000)(CUresult error, const char **pStr);
+typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGet_v2000)(CUdevice_v1 *device, int ordinal);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetCount_v2000)(int *count);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetName_v2000)(char *name, int len, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v9020)(CUuuid *uuid, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v11040)(CUuuid *uuid, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetLuid_v10000)(char *luid, unsigned int *deviceNodeMask, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v3020)(size_t *bytes, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010)(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetAttribute_v2000)(int *pi, CUdevice_attribute attrib, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetNvSciSyncAttributes_v10020)(void *nvSciSyncAttrList, CUdevice_v1 dev, int flags);
+typedef CUresult (CUDAAPI *PFN_cuDeviceSetMemPool_v11020)(CUdevice_v1 dev, CUmemoryPool pool);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetMemPool_v11020)(CUmemoryPool *pool, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetDefaultMemPool_v11020)(CUmemoryPool *pool_out, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetProperties_v2000)(CUdevprop_v1 *prop, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceComputeCapability_v2000)(int *major, int *minor, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRetain_v7000)(CUcontext *pctx, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v11000)(CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v11000)(CUdevice_v1 dev, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxGetState_v7000)(CUdevice_v1 dev, unsigned int *flags, int *active);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v11000)(CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetExecAffinitySupport_v11040)(int *pi, CUexecAffinityType type, CUdevice dev);
+typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v3020)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v11040)(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetId_v12000)(CUcontext ctx, unsigned long long *ctxId);
+typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v4000)(CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v4000)(CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v4000)(CUcontext *pctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetCurrent_v4000)(CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetCurrent_v4000)(CUcontext *pctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetDevice_v2000)(CUdevice_v1 *device);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetFlags_v7000)(unsigned int *flags);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetFlags_v12010)(unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuCtxSynchronize_v2000)(void);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetLimit_v3010)(CUlimit limit, size_t value);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetLimit_v3010)(size_t *pvalue, CUlimit limit);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetCacheConfig_v3020)(CUfunc_cache *pconfig);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetCacheConfig_v3020)(CUfunc_cache config);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetSharedMemConfig_v4020)(CUsharedconfig *pConfig);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetSharedMemConfig_v4020)(CUsharedconfig config);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetApiVersion_v3020)(CUcontext ctx, unsigned int *version);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetStreamPriorityRange_v5050)(int *leastPriority, int *greatestPriority);
+typedef CUresult (CUDAAPI *PFN_cuCtxResetPersistingL2Cache_v11000)(void);
+typedef CUresult (CUDAAPI *PFN_cuCtxAttach_v2000)(CUcontext *pctx, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuCtxDetach_v2000)(CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetExecAffinity_v11040)(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type);
+typedef CUresult (CUDAAPI *PFN_cuModuleLoad_v2000)(CUmodule *module, const char *fname);
+typedef CUresult (CUDAAPI *PFN_cuModuleLoadData_v2000)(CUmodule *module, const void *image);
+typedef CUresult (CUDAAPI *PFN_cuModuleLoadDataEx_v2010)(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult (CUDAAPI *PFN_cuModuleLoadFatBinary_v2000)(CUmodule *module, const void *fatCubin);
+typedef CUresult (CUDAAPI *PFN_cuModuleUnload_v2000)(CUmodule hmod);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetFunction_v2000)(CUfunction *hfunc, CUmodule hmod, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v3020)(CUdeviceptr_v2 *dptr, size_t *bytes, CUmodule hmod, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetTexRef_v2000)(CUtexref *pTexRef, CUmodule hmod, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetSurfRef_v3000)(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetFunctionCount)(unsigned int *count, CUmodule hmod);
+typedef CUresult (CUDAAPI *PFN_cuModuleEnumerateFunctions)(CUfunction *functions, unsigned int numFunctions, CUmodule mod);
+typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v6050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v6050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v6050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult (CUDAAPI *PFN_cuLinkComplete_v5050)(CUlinkState state, void **cubinOut, size_t *sizeOut);
+typedef CUresult (CUDAAPI *PFN_cuLinkDestroy_v5050)(CUlinkState state);
+typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v3020)(size_t *free, size_t *total);
+typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v3020)(CUdeviceptr_v2 *dptr, size_t bytesize);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v3020)(CUdeviceptr_v2 *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
+typedef CUresult (CUDAAPI *PFN_cuMemFree_v3020)(CUdeviceptr_v2 dptr);
+typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v3020)(CUdeviceptr_v2 *pbase, size_t *psize, CUdeviceptr_v2 dptr);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v3020)(void **pp, size_t bytesize);
+typedef CUresult (CUDAAPI *PFN_cuMemFreeHost_v2000)(void *p);
+typedef CUresult (CUDAAPI *PFN_cuMemHostAlloc_v2020)(void **pp, size_t bytesize, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v3020)(CUdeviceptr_v2 *pdptr, void *p, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuMemHostGetFlags_v2030)(unsigned int *pFlags, void *p);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocManaged_v6000)(CUdeviceptr_v2 *dptr, size_t bytesize, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetByPCIBusId_v4010)(CUdevice_v1 *dev, const char *pciBusId);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetPCIBusId_v4010)(char *pciBusId, int len, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuIpcGetEventHandle_v4010)(CUipcEventHandle_v1 *pHandle, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuIpcOpenEventHandle_v4010)(CUevent *phEvent, CUipcEventHandle_v1 handle);
+typedef CUresult (CUDAAPI *PFN_cuIpcGetMemHandle_v4010)(CUipcMemHandle_v1 *pHandle, CUdeviceptr_v2 dptr);
+typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v11000)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuIpcCloseMemHandle_v4010)(CUdeviceptr_v2 dptr);
+typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v6050)(void *p, size_t bytesize, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuMemHostUnregister_v4000)(void *p);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy_v7000_ptds)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v7000_ptds)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v7000_ptds)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v7000_ptds)(const CUDA_MEMCPY3D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v7000_ptds)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v7000_ptsz)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v7000_ptsz)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v7000_ptsz)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v7000_ptsz)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v7000_ptsz)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v7000_ptsz)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v7000_ptsz)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v2 *pAllocateArray);
+typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v3020)(CUDA_ARRAY_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
+typedef CUresult (CUDAAPI *PFN_cuArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUarray array);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUmipmappedArray mipmap);
+typedef CUresult (CUDAAPI *PFN_cuArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUarray array, CUdevice device);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUmipmappedArray mipmap, CUdevice device);
+typedef CUresult (CUDAAPI *PFN_cuArrayGetPlane_v11020)(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx);
+typedef CUresult (CUDAAPI *PFN_cuArrayDestroy_v2000)(CUarray hArray);
+typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pAllocateArray);
+typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v3020)(CUDA_ARRAY3D_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayCreate_v5000)(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pMipmappedArrayDesc, unsigned int numMipmapLevels);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetLevel_v5000)(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayDestroy_v5000)(CUmipmappedArray hMipmappedArray);
+typedef CUresult (CUDAAPI *PFN_cuMemAddressReserve_v10020)(CUdeviceptr_v2 *ptr, size_t size, size_t alignment, CUdeviceptr_v2 addr, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemAddressFree_v10020)(CUdeviceptr_v2 ptr, size_t size);
+typedef CUresult (CUDAAPI *PFN_cuMemCreate_v10020)(CUmemGenericAllocationHandle_v1 *handle, size_t size, const CUmemAllocationProp_v1 *prop, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemRelease_v10020)(CUmemGenericAllocationHandle_v1 handle);
+typedef CUresult (CUDAAPI *PFN_cuMemMap_v10020)(CUdeviceptr_v2 ptr, size_t size, size_t offset, CUmemGenericAllocationHandle_v1 handle, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010_ptsz)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemUnmap_v10020)(CUdeviceptr_v2 ptr, size_t size);
+typedef CUresult (CUDAAPI *PFN_cuMemSetAccess_v10020)(CUdeviceptr_v2 ptr, size_t size, const CUmemAccessDesc_v1 *desc, size_t count);
+typedef CUresult (CUDAAPI *PFN_cuMemGetAccess_v10020)(unsigned long long *flags, const CUmemLocation_v1 *location, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuMemExportToShareableHandle_v10020)(void *shareableHandle, CUmemGenericAllocationHandle_v1 handle, CUmemAllocationHandleType handleType, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemImportFromShareableHandle_v10020)(CUmemGenericAllocationHandle_v1 *handle, void *osHandle, CUmemAllocationHandleType shHandleType);
+typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationGranularity_v10020)(size_t *granularity, const CUmemAllocationProp_v1 *prop, CUmemAllocationGranularity_flags option);
+typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationPropertiesFromHandle_v10020)(CUmemAllocationProp_v1 *prop, CUmemGenericAllocationHandle_v1 handle);
+typedef CUresult (CUDAAPI *PFN_cuMemRetainAllocationHandle_v11000)(CUmemGenericAllocationHandle_v1 *handle, void *addr);
+typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020_ptsz)(CUdeviceptr_v2 dptr, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolTrimTo_v11020)(CUmemoryPool pool, size_t minBytesToKeep);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAccess_v11020)(CUmemoryPool pool, const CUmemAccessDesc_v1 *map, size_t count);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAccess_v11020)(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation_v1 *location);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolCreate_v11020)(CUmemoryPool *pool, const CUmemPoolProps_v1 *poolProps);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolDestroy_v11020)(CUmemoryPool pool);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolExportToShareableHandle_v11020)(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolImportFromShareableHandle_v11020)(CUmemoryPool *pool_out, void *handle, CUmemAllocationHandleType handleType, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolExportPointer_v11020)(CUmemPoolPtrExportData_v1 *shareData_out, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolImportPointer_v11020)(CUdeviceptr_v2 *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData_v1 *shareData);
+typedef CUresult (CUDAAPI *PFN_cuPointerGetAttribute_v4000)(void *data, CUpointer_attribute attribute, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000_ptsz)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemAdvise_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUmem_advise advice, CUdevice_v1 device);
+typedef CUresult (CUDAAPI *PFN_cuMemAdvise_v12020)(CUdeviceptr_v2 devPtr, size_t count, CUmem_advise advice, CUmemLocation_v1 location);
+typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v12020_ptsz)(CUdeviceptr_v2 devPtr, size_t count, CUmemLocation_v1 location, unsigned int flags, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttribute_v8000)(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr_v2 devPtr, size_t count);
+typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttributes_v8000)(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr_v2 devPtr, size_t count);
+typedef CUresult (CUDAAPI *PFN_cuMulticastCreate_v12010)(CUmemGenericAllocationHandle *mcHandle, const CUmulticastObjectProp *prop);
+typedef CUresult (CUDAAPI *PFN_cuMulticastAddDevice_v12010)(CUmemGenericAllocationHandle mcHandle, CUdevice dev);
+typedef CUresult (CUDAAPI *PFN_cuMulticastBindMem_v12010)(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUmemGenericAllocationHandle memHandle, size_t memOffset, size_t size, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMulticastBindAddr_v12010)(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUdeviceptr memptr, size_t size, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMulticastUnbind_v12010)(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, size_t size);
+typedef CUresult (CUDAAPI *PFN_cuMulticastGetGranularity_v12010)(size_t *granularity, const CUmulticastObjectProp *prop, CUmulticastGranularity_flags option);
+typedef CUresult (CUDAAPI *PFN_cuPointerSetAttribute_v6000)(const void *value, CUpointer_attribute attribute, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuPointerGetAttributes_v7000)(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuStreamCreate_v2000)(CUstream *phStream, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamCreateWithPriority_v5050)(CUstream *phStream, unsigned int flags, int priority);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetId_v12000)(CUstream hStream, unsigned long long *streamId);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetId_v12000_ptsz)(CUstream hStream, unsigned long long *streamId);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v7000_ptsz)(CUstream hStream, int *priority);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v7000_ptsz)(CUstream hStream, unsigned int *flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020_ptsz)(CUstream hStream, CUcontext *pctx);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v7000_ptsz)(CUstream hStream, CUevent hEvent, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v7000_ptsz)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010_ptsz)(CUstream hStream, CUstreamCaptureMode mode);
+typedef CUresult (CUDAAPI *PFN_cuStreamBeginCaptureToGraph_v12030_ptsz)(CUstream hStream, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUstreamCaptureMode mode);
+typedef CUresult (CUDAAPI *PFN_cuThreadExchangeStreamCaptureMode_v10010)(CUstreamCaptureMode *mode);
+typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000_ptsz)(CUstream hStream, CUgraph *phGraph);
+typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v12030_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, const CUgraphEdgeData **edgeData_out, size_t *numDependencies_out);
+typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030_ptsz)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v12030_ptsz)(CUstream hStream, CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v7000_ptsz)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v7000_ptsz)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v7000_ptsz)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v4000)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000_ptsz)(CUstream dst, CUstream src);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value_out);
+typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *value);
+typedef CUresult (CUDAAPI *PFN_cuEventCreate_v2000)(CUevent *phEvent, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuEventRecord_v7000_ptsz)(CUevent hEvent, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010_ptsz)(CUevent hEvent, CUstream hStream, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuEventQuery_v2000)(CUevent hEvent);
+typedef CUresult (CUDAAPI *PFN_cuEventSynchronize_v2000)(CUevent hEvent);
+typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v4000)(CUevent hEvent);
+typedef CUresult (CUDAAPI *PFN_cuEventElapsedTime_v2000)(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+typedef CUresult (CUDAAPI *PFN_cuImportExternalMemory_v10000)(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 *memHandleDesc);
+typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedBuffer_v10000)(CUdeviceptr_v2 *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 *bufferDesc);
+typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedMipmappedArray_v10000)(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 *mipmapDesc);
+typedef CUresult (CUDAAPI *PFN_cuDestroyExternalMemory_v10000)(CUexternalMemory extMem);
+typedef CUresult (CUDAAPI *PFN_cuImportExternalSemaphore_v10000)(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 *semHandleDesc);
+typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
+typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
+typedef CUresult (CUDAAPI *PFN_cuDestroyExternalSemaphore_v10000)(CUexternalSemaphore extSem);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams_v1 *paramArray, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v11070_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuFuncGetAttribute_v2020)(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
+typedef CUresult (CUDAAPI *PFN_cuFuncSetAttribute_v9000)(CUfunction hfunc, CUfunction_attribute attrib, int value);
+typedef CUresult (CUDAAPI *PFN_cuFuncSetCacheConfig_v3000)(CUfunction hfunc, CUfunc_cache config);
+typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedMemConfig_v4020)(CUfunction hfunc, CUsharedconfig config);
+typedef CUresult (CUDAAPI *PFN_cuFuncGetName_v12030)(const char **name, CUfunction hfunc);
+typedef CUresult (CUDAAPI *PFN_cuFuncGetParamInfo_v12040)(CUfunction func, size_t paramIndex, size_t *paramOffset, size_t *paramSize);
+typedef CUresult (CUDAAPI *PFN_cuFuncIsLoaded_v12040)(CUfunctionLoadingState *state, CUfunction hfunc);
+typedef CUresult (CUDAAPI *PFN_cuFuncLoad_v12040)(CUfunction hfunc);
+typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v7000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); /* Launch pointer types. The _ptsz-suffixed ones share their parameter lists with unsuffixed typedefs later in this file (e.g. PFN_cuLaunchKernel_v4000). NOTE(review): _ptsz presumably means per-thread default stream - confirm. */
+typedef CUresult (CUDAAPI *PFN_cuLaunchKernelEx_v11060_ptsz)(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra); /* Launch configuration is passed via a CUlaunchConfig pointer rather than individual grid/block arguments. */
+typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); /* Same as the cuLaunchKernel signature minus the trailing extra argument. */
+typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernelMultiDevice_v9000)(CUDA_LAUNCH_PARAMS_v1 *launchParamsList, unsigned int numDevices, unsigned int flags); /* No stream parameter and no _ptsz suffix on this one. */
+typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000_ptsz)(CUstream hStream, CUhostFn fn, void *userData);
+typedef CUresult (CUDAAPI *PFN_cuFuncSetBlockShape_v2000)(CUfunction hfunc, int x, int y, int z); /* _v2000 pointer types that set launch state (block shape, shared size, individual parameters) on the CUfunction in separate calls. NOTE(review): believed to be the legacy launch path that predates cuLaunchKernel - confirm before relying on it in new code. */
+typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedSize_v2000)(CUfunction hfunc, unsigned int bytes);
+typedef CUresult (CUDAAPI *PFN_cuParamSetSize_v2000)(CUfunction hfunc, unsigned int numbytes);
+typedef CUresult (CUDAAPI *PFN_cuParamSeti_v2000)(CUfunction hfunc, int offset, unsigned int value); /* cuParamSet{i,f,v}: same (hfunc, offset, ...) shape with an unsigned int, float, or raw pointer+size payload respectively. */
+typedef CUresult (CUDAAPI *PFN_cuParamSetf_v2000)(CUfunction hfunc, int offset, float value);
+typedef CUresult (CUDAAPI *PFN_cuParamSetv_v2000)(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
+typedef CUresult (CUDAAPI *PFN_cuLaunch_v2000)(CUfunction f); /* Takes only the function handle; no grid, block, or stream arguments. */
+typedef CUresult (CUDAAPI *PFN_cuLaunchGrid_v2000)(CUfunction f, int grid_width, int grid_height);
+typedef CUresult (CUDAAPI *PFN_cuLaunchGridAsync_v2000)(CUfunction f, int grid_width, int grid_height, CUstream hStream); /* cuLaunchGrid signature plus a CUstream. */
+typedef CUresult (CUDAAPI *PFN_cuParamSetTexRef_v2000)(CUfunction hfunc, int texunit, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuGraphCreate_v10000)(CUgraph *phGraph, unsigned int flags); /* Graph construction pointer types. The cuGraphAdd*Node typedefs below share a common prefix: (CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, ...node-specific arguments). */
+typedef CUresult (CUDAAPI *PFN_cuGraphAddKernelNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetParams_v10000)(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddKernelNode_v12000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS_v2 *nodeParams); /* _v12000 kernel-node typedefs mirror the _v10000 ones above but use CUDA_KERNEL_NODE_PARAMS_v2 instead of _v1. */
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetParams_v12000)(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS_v2 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetParams_v12000)(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v2 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddMemcpyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx); /* Memcpy and memset node creation additionally take a CUcontext. */
+typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMCPY3D_v2 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddMemsetNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddHostNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeGetParams_v10000)(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddChildGraphNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphChildGraphNodeGetGraph_v10000)(CUgraphNode hNode, CUgraph *phGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddEmptyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies); /* Only the common prefix; no node-specific arguments. */
+typedef CUresult (CUDAAPI *PFN_cuGraphAddEventRecordNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddEventWaitNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresSignalNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *params_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresWaitNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *params_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddBatchMemOpNode_v11070)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphBatchMemOpNodeGetParams_v11070)(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphBatchMemOpNodeSetParams_v11070)(CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecBatchMemOpNodeSetParams_v11070)(CUgraphExec graphExec, CUgraphNode node, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams); /* Operates on a CUgraphExec plus node; parameters are named graphExec/node here rather than the hGraphExec/hNode spelling used by the other cuGraphExec* typedefs in this file. */
+typedef CUresult (CUDAAPI *PFN_cuGraphClone_v10000)(CUgraph *phGraphClone, CUgraph originalGraph); /* Graph cloning, inspection, and dependency-editing pointer types. */
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeFindInClone_v10000)(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetType_v10000)(CUgraphNode hNode, CUgraphNodeType *type);
+typedef CUresult (CUDAAPI *PFN_cuGraphGetNodes_v10000)(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes); /* The count is passed by pointer (size_t *) in the Get* typedefs of this group. */
+typedef CUresult (CUDAAPI *PFN_cuGraphGetRootNodes_v10000)(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes);
+typedef CUresult (CUDAAPI *PFN_cuGraphGetEdges_v10000)(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges);
+typedef CUresult (CUDAAPI *PFN_cuGraphGetEdges_v12030)(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, CUgraphEdgeData *edgeData, size_t *numEdges); /* Each _v12030 typedef in this group equals its _v10000 sibling plus a CUgraphEdgeData pointer inserted before the count. */
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependencies_v10000)(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependencies_v12030)(CUgraphNode hNode, CUgraphNode *dependencies, CUgraphEdgeData *edgeData, size_t *numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependentNodes_v10000)(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependentNodes_v12030)(CUgraphNode hNode, CUgraphNode *dependentNodes, CUgraphEdgeData *edgeData, size_t *numDependentNodes);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); /* Add/Remove take const arrays and a by-value count, unlike the Get* typedefs above. */
+typedef CUresult (CUDAAPI *PFN_cuGraphAddDependencies_v12030)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, const CUgraphEdgeData *edgeData, size_t numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphRemoveDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphRemoveDependencies_v12030)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, const CUgraphEdgeData *edgeData, size_t numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphDestroyNode_v10000)(CUgraphNode hNode);
+typedef CUresult (CUDAAPI *PFN_cuGraphInstantiateWithFlags_v11040)(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags); /* Executable-graph (CUgraphExec) pointer types: instantiate, per-node parameter updates, upload/launch, destroy, and whole-graph update. */
+typedef CUresult (CUDAAPI *PFN_cuGraphInstantiateWithParams_v12000_ptsz)(CUgraphExec *phGraphExec, CUgraph hGraph, CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams); /* Same parameter list as the unsuffixed PFN_cuGraphInstantiateWithParams_v12000 later in this file. */
+typedef CUresult (CUDAAPI *PFN_cuGraphExecGetFlags_v12000)(CUgraphExec hGraphExec, cuuint64_t *flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecKernelNodeSetParams_v10010)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecKernelNodeSetParams_v12000)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v2 *nodeParams); /* _v12000 differs from _v10010 only in using CUDA_KERNEL_NODE_PARAMS_v2. */
+typedef CUresult (CUDAAPI *PFN_cuGraphExecMemcpyNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecMemsetNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecHostNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecChildGraphNodeSetParams_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecEventRecordNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecEventWaitNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010_ptsz)(CUgraphExec hGraphExec, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000_ptsz)(CUgraphExec hGraphExec, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecDestroy_v10000)(CUgraphExec hGraphExec);
+typedef CUresult (CUDAAPI *PFN_cuGraphDestroy_v10000)(CUgraph hGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecUpdate_v10020)(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecUpdate_v12000)(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphExecUpdateResultInfo *resultInfo); /* _v12000 replaces the two *_out pointers of _v10020 with a single CUgraphExecUpdateResultInfo pointer; the two signatures are not call-compatible. */
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeCopyAttributes_v11000)(CUgraphNode dst, CUgraphNode src); /* Remaining graph pointer types: kernel-node attributes, debug dump, memory alloc/free nodes, node enable state, generic node params, conditional handles, and per-device graph memory. */
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, CUkernelNodeAttrValue_v1 *value_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, const CUkernelNodeAttrValue_v1 *value);
+typedef CUresult (CUDAAPI *PFN_cuGraphDebugDotPrint_v11030)(CUgraph hGraph, const char *path, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddMemAllocNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams); /* nodeParams is a non-const pointer here, unlike most cuGraphAdd*Node typedefs in this file. */
+typedef CUresult (CUDAAPI *PFN_cuGraphMemAllocNodeGetParams_v11040)(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddMemFreeNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr); /* NOTE(review): spelled with unversioned CUdeviceptr, whereas the memcpy/memset typedefs in this file use CUdeviceptr_v2; leave as-is unless the upstream header differs. */
+typedef CUresult (CUDAAPI *PFN_cuGraphMemFreeNodeGetParams_v11040)(CUgraphNode hNode, CUdeviceptr *dptr_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeSetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddNode_v12020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraphNodeParams *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddNode_v12030)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUgraphNodeParams *nodeParams); /* _v12030 adds a const CUgraphEdgeData pointer (dependencyData) between dependencies and numDependencies. */
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeSetParams_v12020)(CUgraphNode hNode, CUgraphNodeParams *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecNodeSetParams_v12020)(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraphNodeParams *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphConditionalHandleCreate_v12030)(CUgraphConditionalHandle *pHandle_out, CUgraph hGraph, CUcontext ctx, unsigned int defaultLaunchValue, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGraphMemTrim_v11040)(CUdevice device);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value); /* Get and Set share one signature; value is an untyped pointer in both. */
+typedef CUresult (CUDAAPI *PFN_cuDeviceSetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize); /* Occupancy-query pointer types; each takes a CUfunction and one or more leading pointer parameters. */
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags); /* WithFlags variants append a trailing unsigned int flags to the base signature. */
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSize_v6050)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020)(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialClusterSize_v11070)(int *clusterSize, CUfunction func, const CUlaunchConfig *config); /* The two cluster queries take a const CUlaunchConfig pointer instead of explicit block-size arguments. */
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveClusters_v11070)(int *numClusters, CUfunction func, const CUlaunchConfig *config);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetArray_v2000)(CUtexref hTexRef, CUarray hArray, unsigned int Flags); /* Texture-reference (CUtexref) and surface-reference (CUsurfref) pointer types. Setters take the handle first; getters take a leading pointer parameter and the handle last. NOTE(review): the reference-based texture/surface API is, as far as I know, deprecated in favour of texture/surface objects - confirm. */
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmappedArray_v5000)(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v3020)(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr_v2 dptr, size_t bytes); /* Exception to the setter pattern: a size_t pointer (ByteOffset) precedes the handle. */
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v4010)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetFormat_v2000)(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddressMode_v2000)(CUtexref hTexRef, int dim, CUaddress_mode am);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetFilterMode_v2000)(CUtexref hTexRef, CUfilter_mode fm);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapFilterMode_v5000)(CUtexref hTexRef, CUfilter_mode fm);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelBias_v5000)(CUtexref hTexRef, float bias);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelClamp_v5000)(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMaxAnisotropy_v5000)(CUtexref hTexRef, unsigned int maxAniso);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetBorderColor_v8000)(CUtexref hTexRef, float *pBorderColor); /* Setter takes a non-const float pointer. */
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetFlags_v2000)(CUtexref hTexRef, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v3020)(CUdeviceptr_v2 *pdptr, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetArray_v2000)(CUarray *phArray, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmappedArray_v5000)(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddressMode_v2000)(CUaddress_mode *pam, CUtexref hTexRef, int dim);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetFilterMode_v2000)(CUfilter_mode *pfm, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetFormat_v2000)(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapFilterMode_v5000)(CUfilter_mode *pfm, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelBias_v5000)(float *pbias, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelClamp_v5000)(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMaxAnisotropy_v5000)(int *pmaxAniso, CUtexref hTexRef); /* Note the signedness mismatch: this getter uses an int pointer while PFN_cuTexRefSetMaxAnisotropy_v5000 takes unsigned int. */
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetBorderColor_v8000)(float *pBorderColor, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetFlags_v2000)(unsigned int *pFlags, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefCreate_v2000)(CUtexref *pTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefDestroy_v2000)(CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuSurfRefSetArray_v3000)(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuSurfRefGetArray_v3000)(CUarray *phArray, CUsurfref hSurfRef);
+typedef CUresult (CUDAAPI *PFN_cuTexObjectCreate_v5000)(CUtexObject_v1 *pTexObject, const CUDA_RESOURCE_DESC_v1 *pResDesc, const CUDA_TEXTURE_DESC_v1 *pTexDesc, const CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc); /* Texture-object and surface-object pointer types: create from descriptor pointers, destroy by value, and read descriptors back through a leading pointer parameter. */
+typedef CUresult (CUDAAPI *PFN_cuTexObjectDestroy_v5000)(CUtexObject_v1 texObject);
+typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUtexObject_v1 texObject);
+typedef CUresult (CUDAAPI *PFN_cuTexObjectGetTextureDesc_v5000)(CUDA_TEXTURE_DESC_v1 *pTexDesc, CUtexObject_v1 texObject);
+typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceViewDesc_v5000)(CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc, CUtexObject_v1 texObject);
+typedef CUresult (CUDAAPI *PFN_cuSurfObjectCreate_v5000)(CUsurfObject_v1 *pSurfObject, const CUDA_RESOURCE_DESC_v1 *pResDesc); /* Surface objects take only a resource descriptor; there is no texture or view descriptor parameter. */
+typedef CUresult (CUDAAPI *PFN_cuSurfObjectDestroy_v5000)(CUsurfObject_v1 surfObject);
+typedef CUresult (CUDAAPI *PFN_cuSurfObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUsurfObject_v1 surfObject);
+typedef CUresult (CUDAAPI *PFN_cuTensorMapEncodeTiled_v12000)(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const cuuint32_t *boxDim, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill); /* Tensor-map pointer types; the first parameter of each is a CUtensorMap pointer. NOTE(review): the array parameters are presumably tensorRank elements long - confirm against the driver API docs. */
+typedef CUresult (CUDAAPI *PFN_cuTensorMapEncodeIm2col_v12000)(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const int *pixelBoxLowerCorner, const int *pixelBoxUpperCorner, cuuint32_t channelsPerPixel, cuuint32_t pixelsPerColumn, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill); /* Differs from EncodeTiled by replacing boxDim with pixel-box corner arrays plus channelsPerPixel and pixelsPerColumn. */
+typedef CUresult (CUDAAPI *PFN_cuTensorMapReplaceAddress_v12000)(CUtensorMap *tensorMap, void *globalAddress);
+typedef CUresult (CUDAAPI *PFN_cuDeviceCanAccessPeer_v4000)(int *canAccessPeer, CUdevice_v1 dev, CUdevice_v1 peerDev); /* Peer-access pointer types: device-pair queries take two CUdevice_v1 values, enable/disable take a peer CUcontext. */
+typedef CUresult (CUDAAPI *PFN_cuCtxEnablePeerAccess_v4000)(CUcontext peerContext, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuCtxDisablePeerAccess_v4000)(CUcontext peerContext);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetP2PAttribute_v8000)(int *value, CUdevice_P2PAttribute attrib, CUdevice_v1 srcDevice, CUdevice_v1 dstDevice);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsUnregisterResource_v3000)(CUgraphicsResource resource); /* Graphics-interop pointer types operating on CUgraphicsResource handles. */
+typedef CUresult (CUDAAPI *PFN_cuGraphicsSubResourceGetMappedArray_v3000)(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000)(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, size_t *pSize, CUgraphicsResource resource);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v6050)(CUgraphicsResource resource, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream); /* _ptsz map/unmap typedefs have the same parameter lists as the unsuffixed _v3000 typedefs later in this file. */
+typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGetExportTable_v3000)(const void **ppExportTable, const CUuuid *pExportTableId); /* Lookup pointer types: export table by UUID, owning module of a function, and driver entry point by symbol name. */
+typedef CUresult (CUDAAPI *PFN_cuFuncGetModule_v11000)(CUmodule *hmod, CUfunction hfunc);
+typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
+typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v12000)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags, CUdriverProcAddressQueryResult *symbolFound); /* _v12000 appends a CUdriverProcAddressQueryResult pointer (symbolFound) to the _v11030 signature. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount); /* Memcpy/memset pointer types with no _ptds/_ptsz suffix. Naming in this group: H = host pointer, D = CUdeviceptr_v2, A = CUarray with a size_t offset (as the parameter types below show). NOTE(review): presumably these are the legacy-default-stream counterparts of suffixed typedefs defined outside this chunk - confirm. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v3020)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v3020)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v3020)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); /* Async variants append a trailing CUstream hStream to the synchronous signature. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v3020)(const CUDA_MEMCPY2D_v2 *pCopy); /* 2D/3D copies are fully described by a descriptor struct passed by const pointer. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v3020)(const CUDA_MEMCPY2D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v3020)(const CUDA_MEMCPY3D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v3020)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v3020)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N); /* Memset D8/D16/D32 take an unsigned char/short/int fill value respectively. NOTE(review): N is presumably an element count rather than a byte count - confirm. */
+typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); /* 2D memsets replace N with dstPitch, Width and Height. */
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount); /* Direction-agnostic copy: both endpoints are CUdeviceptr_v2. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount); /* Peer copies pair each device pointer with its CUcontext. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v5050)(CUstream hStream, int *priority); /* Unsuffixed stream, event, launch, and stream mem-op pointer types; each takes a CUstream. */
+typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v5050)(CUstream hStream, unsigned int *flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020)(CUstream hStream, CUcontext *pctx);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v3020)(CUstream hStream, CUevent hEvent, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v5000)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v6000)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v2000)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v2000)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuEventRecord_v2000)(CUevent hEvent, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010)(CUevent hEvent, CUstream hStream, unsigned int flags); /* cuEventRecord signature plus a trailing flags argument. */
+typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v4000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); /* Same parameter list as PFN_cuLaunchKernel_v7000_ptsz earlier in this file; only the typedef name and version differ. */
+typedef CUresult (CUDAAPI *PFN_cuLaunchKernelEx_v11060)(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra);
+typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000)(CUstream hStream, CUhostFn fn, void *userData);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags); /* Stream mem-ops: the 32-bit variants are versioned _v8000 and the 64-bit variants _v9000 in this first set. */
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags); /* The _v11070 set below repeats the five signatures above verbatim; only the version suffix changes. NOTE(review): any behavioural difference between the versions is not visible from these declarations. */
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v11070)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream); /* Unsuffixed stream-taking pointer types: prefetch, cooperative launch, external semaphores, stream capture, graph upload/launch, stream attributes, and stream-ordered allocation. */
+typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v12020)(CUdeviceptr_v2 devPtr, size_t count, CUmemLocation_v1 location, unsigned int flags, CUstream hStream); /* _v12020 replaces the CUdevice_v1 destination with a by-value CUmemLocation_v1 plus flags; not call-compatible with _v8000. */
+typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
+typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
+typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
+typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010)(CUstream hStream, CUstreamCaptureMode mode);
+typedef CUresult (CUDAAPI *PFN_cuStreamBeginCaptureToGraph_v12030)(CUstream hStream, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUstreamCaptureMode mode);
+typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000)(CUstream hStream, CUgraph *phGraph);
+typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); /* Three versions follow, each extending the previous parameter list. */
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); /* Adds graph_out, dependencies_out and numDependencies_out. */
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v12030)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, const CUgraphEdgeData **edgeData_out, size_t *numDependencies_out); /* Adds edgeData_out before numDependencies_out. */
+typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v12030)(CUstream hStream, CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, unsigned int flags); /* _v12030 inserts dependencyData after dependencies. */
+typedef CUresult (CUDAAPI *PFN_cuGraphInstantiateWithParams_v12000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010)(CUgraphExec hGraph, CUstream hStream); /* NOTE(review): the parameter is named hGraph but its type is CUgraphExec; the _ptsz typedef earlier in this file names it hGraphExec. Parameter names do not affect the function-pointer type. */
+typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000)(CUgraphExec hGraph, CUstream hStream); /* Same hGraph-vs-CUgraphExec naming quirk as cuGraphUpload above. */
+typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000)(CUstream dstStream, CUstream srcStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value);
+typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *param);
+typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020)(CUdeviceptr_v2 dptr, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); /* cuMemAllocAsync signature plus an explicit CUmemoryPool. */
+typedef CUresult (CUDAAPI *PFN_cuFlushGPUDirectRDMAWrites_v11030)(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope);
+typedef CUresult (CUDAAPI *PFN_cuUserObjectCreate_v11030)(CUuserObject *object_out, void *ptr, CUhostFn destroy, unsigned int initialRefcount, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuUserObjectRetain_v11030)(CUuserObject object, unsigned int count);
+typedef CUresult (CUDAAPI *PFN_cuUserObjectRelease_v11030)(CUuserObject object, unsigned int count);
+typedef CUresult (CUDAAPI *PFN_cuGraphRetainUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphReleaseUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetLoadingMode_v11070)(CUmoduleLoadingMode *mode);
+typedef CUresult (CUDAAPI *PFN_cuMemGetHandleForAddressRange_v11070)(void *handle, CUdeviceptr dptr, size_t size, CUmemRangeHandleType handleType, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuLibraryLoadData_v12000)(CUlibrary *library, const void *code, CUjit_option *jitOptions, void **jitOptionsValues, unsigned int numJitOptions, CUlibraryOption *libraryOptions, void** libraryOptionValues, unsigned int numLibraryOptions);
+typedef CUresult (CUDAAPI *PFN_cuLibraryLoadFromFile_v12000)(CUlibrary *library, const char *fileName, CUjit_option *jitOptions, void **jitOptionsValues, unsigned int numJitOptions, CUlibraryOption *libraryOptions, void **libraryOptionValues, unsigned int numLibraryOptions);
+typedef CUresult (CUDAAPI *PFN_cuLibraryUnload_v12000)(CUlibrary library);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetKernel_v12000)(CUkernel *pKernel, CUlibrary library, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetModule_v12000)(CUmodule *pMod, CUlibrary library);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetKernelCount)(unsigned int *count, CUlibrary lib);
+typedef CUresult (CUDAAPI *PFN_cuLibraryEnumerateKernels)(CUkernel *kernels, unsigned int numKernels, CUlibrary lib);
+typedef CUresult (CUDAAPI *PFN_cuKernelGetFunction_v12000)(CUfunction *pFunc, CUkernel kernel);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetGlobal_v12000)(CUdeviceptr *dptr, size_t *bytes, CUlibrary library, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetManaged_v12000)(CUdeviceptr *dptr, size_t *bytes, CUlibrary library, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuKernelGetAttribute_v12000)(int *pi, CUfunction_attribute attrib, CUkernel kernel, CUdevice dev);
+typedef CUresult (CUDAAPI *PFN_cuKernelSetAttribute_v12000)(CUfunction_attribute attrib, int val, CUkernel kernel, CUdevice dev);
+typedef CUresult (CUDAAPI *PFN_cuKernelSetCacheConfig_v12000)(CUkernel kernel, CUfunc_cache config, CUdevice dev);
+typedef CUresult (CUDAAPI *PFN_cuKernelGetName_v12030)(const char **name, CUkernel hfunc);
+typedef CUresult (CUDAAPI *PFN_cuKernelGetParamInfo_v12040)(CUkernel kernel, size_t paramIndex, size_t *paramOffset, size_t *paramSize);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetUnifiedFunction_v12000)(void **fptr, CUlibrary library, const char *symbol);
+typedef CUresult(CUDAAPI *PFN_cuCoredumpGetAttribute_v12010)(CUcoredumpSettings get, void *value, size_t *size);
+typedef CUresult(CUDAAPI *PFN_cuCoredumpGetAttributeGlobal_v12010)(CUcoredumpSettings get, void *value, size_t *size);
+typedef CUresult(CUDAAPI *PFN_cuCoredumpSetAttribute_v12010)(CUcoredumpSettings set, void *value, size_t *size);
+typedef CUresult(CUDAAPI *PFN_cuCoredumpSetAttributeGlobal_v12010)(CUcoredumpSettings set, void *value, size_t *size);
+typedef CUresult(CUDAAPI *PFN_cuDeviceRegisterAsyncNotification_v12040)(CUdevice device, CUasyncCallback callbackFunc, void *userData, CUasyncCallbackHandle *callback);
+typedef CUresult(CUDAAPI *PFN_cuDeviceUnregisterAsyncNotification_v12040)(CUdevice device, CUasyncCallbackHandle callback);
+typedef CUresult(CUDAAPI *PFN_cuGreenCtxCreate_v12040)(CUgreenCtx* phCtx, CUdevResourceDesc desc, CUdevice dev, unsigned int flags);
+typedef CUresult(CUDAAPI *PFN_cuGreenCtxDestroy_v12040)(CUgreenCtx hCtx);
+typedef CUresult(CUDAAPI *PFN_cuDeviceGetDevResource_v12040)(CUdevice dev, CUdevResource* result, CUdevResourceType type);
+typedef CUresult(CUDAAPI *PFN_cuCtxGetDevResource_v12040)(CUcontext hCtx, CUdevResource* result, CUdevResourceType type);
+typedef CUresult(CUDAAPI *PFN_cuGreenCtxGetDevResource_v12040)(CUgreenCtx hCtx, CUdevResource* result, CUdevResourceType type);
+typedef CUresult(CUDAAPI *PFN_cuGreenCtxRecordEvent_v12040)(CUgreenCtx hCtx, CUevent hEvent);
+typedef CUresult(CUDAAPI *PFN_cuGreenCtxWaitEvent_v12040)(CUgreenCtx hCtx, CUevent hEvent);
+typedef CUresult(CUDAAPI *PFN_cuDevResourceGenerateDesc_v12040)(CUdevResourceDesc* phDesc, CUdevResource* resources, unsigned int nbResources);
+typedef CUresult(CUDAAPI *PFN_cuDevSmResourceSplitByCount_v12040)(CUdevResource* result, unsigned int* nbGroups, const CUdevResource* input, CUdevResource* remaining, unsigned int useFlags, unsigned int minCount);
+typedef CUresult(CUDAAPI *PFN_cuStreamGetGreenCtx_v12040)(CUstream hStream, CUgreenCtx *phCtx);
+typedef CUresult(CUDAAPI *PFN_cuCtxFromGreenCtx_v12040)(CUcontext *pContext, CUgreenCtx hCtx);
+/*
+ * Type definitions for older versioned functions in cuda.h
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v4000)(void *p, size_t bytesize, unsigned int Flags);
+    typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v3000)(CUgraphicsResource resource, unsigned int flags);
+    typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v5050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+    typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v5050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
+    typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v5050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
+    typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v3020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
+    typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v2000)(unsigned int *bytes, CUdevice_v1 dev);
+    typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v2000)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev);
+    typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v2000)(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
+    typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v2000)(unsigned int *free, unsigned int *total);
+    typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v2000)(CUdeviceptr_v1 *dptr, unsigned int bytesize);
+    typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v2000)(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes);
+    typedef CUresult (CUDAAPI *PFN_cuMemFree_v2000)(CUdeviceptr_v1 dptr);
+    typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v2000)(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr);
+    typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v2000)(void **pp, unsigned int bytesize);
+    typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v2020)(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v2000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v2000)(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v2000)(const CUDA_MEMCPY3D_v1 *pCopy);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v2000)(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v2000)(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v2000)(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v2000)(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
+    typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray);
+    typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v2000)(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
+    typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray);
+    typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v2000)(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
+    typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v2000)(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes);
+    typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v2020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch);
+    typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v2000)(CUdeviceptr_v1 *pdptr, CUtexref hTexRef);
+    typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3000)(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
+    typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v2000)(CUcontext ctx);
+    typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v2000)(CUcontext *pctx);
+    typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v2000)(CUcontext ctx);
+    typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v2000)(CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v2000)(CUevent hEvent);
+    typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v7000)(CUdevice_v1 dev);
+    typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v7000)(CUdevice_v1 dev);
+    typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v7000)(CUdevice_v1 dev, unsigned int flags);
+    typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000)(CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000_ptsz)(CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v4010)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags);
+    typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v10000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
+    typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v11000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
+#endif /* defined(__CUDA_API_VERSION_INTERNAL) */
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_gl_interop.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_gl_interop.h
new file mode 100644
index 0000000000000000000000000000000000000000..df64a8afa14f695bb05810266ac40b227c078cc5
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_gl_interop.h
@@ -0,0 +1,514 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_GL_INTEROP_H__)
+#define __CUDA_GL_INTEROP_H__
+
+#include "cuda_runtime_api.h"
+
+#if defined(__APPLE__) /* Apple builds take the GL declarations from <OpenGL/gl.h> */
+
+#include <OpenGL/gl.h>
+
+#else /* __APPLE__ */
+
+#if defined(__arm__) || defined(__aarch64__) /* ARM: no GL header is included here; the includer must supply one */
+#ifndef GL_VERSION /* GL_VERSION doubles as the probe that a GL header was already included */
+#error Please include the appropriate gl headers before including cuda_gl_interop.h
+#endif /* GL_VERSION */
+#else /* !(__arm__ || __aarch64__) */
+#include <GL/gl.h>
+#endif /* __arm__ || __aarch64__ */
+
+#endif /* __APPLE__ */
+
+/** \cond impl_private */
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) /* expands to nothing: no deprecation diagnostics */
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER) /* MSVC attribute spelling */
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__) /* GNU attribute spelling */
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else /* any other compiler: expands to nothing */
+#define __CUDA_DEPRECATED
+#endif /* __CUDA_DEPRECATED selection; the macro is #undef'd again at the end of this header */
+/** \endcond impl_private */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \addtogroup CUDART_OPENGL OpenGL Interoperability
+ * This section describes the OpenGL interoperability functions of the CUDA
+ * runtime application programming interface. Note that mapping of OpenGL
+ * resources is performed with the graphics API agnostic, resource mapping 
+ * interface described in \ref CUDART_INTEROP "Graphics Interoperability".
+ *
+ * @{
+ */
+
+/**
+ * Device sets that ::cudaGLGetDevices can report for the current OpenGL context
+ */
+enum cudaGLDeviceList
+{
+  cudaGLDeviceListAll           = 1, /**< The CUDA devices for all GPUs used by the current OpenGL context */
+  cudaGLDeviceListCurrentFrame  = 2, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
+  cudaGLDeviceListNextFrame     = 3  /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
+};
+
+/**
+ * \brief Gets the CUDA devices associated with the current OpenGL context
+ *
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices 
+ * corresponding to the current OpenGL context. Also returns in \p *pCudaDevices 
+ * at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to 
+ * the current OpenGL context. If any of the GPUs being used by the current OpenGL
+ * context are not CUDA capable then the call will return ::cudaErrorNoDevice.
+ *
+ * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to the 
+ *                           current OpenGL context
+ * \param pCudaDevices     - Returned CUDA devices corresponding to the current 
+ *                           OpenGL context
+ * \param cudaDeviceCount  - The size of the output device array \p pCudaDevices
+ * \param deviceList       - The set of devices to return.  This set may be
+ *                           ::cudaGLDeviceListAll for all devices, 
+ *                           ::cudaGLDeviceListCurrentFrame for the devices used to
+ *                           render the current frame (in SLI), or
+ *                           ::cudaGLDeviceListNextFrame for the devices used to
+ *                           render the next frame (in SLI).
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNoDevice,
+ * ::cudaErrorInvalidGraphicsContext,
+ * ::cudaErrorOperatingSystem,
+ * ::cudaErrorUnknown
+ *
+ * \note This function is not supported on Mac OS X.
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources, 
+ * ::cudaGraphicsSubResourceGetMappedArray, 
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGLGetDevices 
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGLGetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, enum cudaGLDeviceList deviceList);
+
+/**
+ * \brief Register an OpenGL texture or renderbuffer object
+ *
+ * Registers the texture or renderbuffer object specified by \p image for access by CUDA.
+ * A handle to the registered object is returned as \p resource.
+ *
+ * \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D, 
+ * ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY, 
+ * or ::GL_RENDERBUFFER.
+ *
+ * The register flags \p flags specify the intended usage, as follows: 
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ * - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
+ *   bind this resource to a surface reference.
+ * - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
+ *   texture gather operations on this resource.
+ *
+ * The following image formats are supported. For brevity's sake, the list is abbreviated.
+ * For example, {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats
+ * {GL_R8, GL_R16, GL_RG8, GL_RG16} :
+ * - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
+ * - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
+ * - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
+ * {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
+ *
+ * The following image classes are currently disallowed:
+ * - Textures with borders
+ * - Multisampled renderbuffers
+ *
+ * \param resource - Pointer to the returned object handle
+ * \param image    - name of texture or renderbuffer object to be registered
+ * \param target   - Identifies the type of object specified by \p image 
+ * \param flags    - Register flags
+ * 
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorOperatingSystem,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources, 
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsGLRegisterImage
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterImage(struct cudaGraphicsResource **resource, GLuint image, GLenum target, unsigned int flags);
+
+/**
+ * \brief Registers an OpenGL buffer object
+ *
+ * Registers the buffer object specified by \p buffer for access by
+ * CUDA.  A handle to the registered object is returned as \p
+ * resource.  The register flags \p flags specify the intended usage,
+ * as follows:
+ *
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * \param resource - Pointer to the returned object handle
+ * \param buffer   - name of buffer object to be registered
+ * \param flags    - Register flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorOperatingSystem,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources,
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsGLRegisterBuffer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterBuffer(struct cudaGraphicsResource **resource, GLuint buffer, unsigned int flags);
+
+#ifdef _WIN32
+#ifndef WGL_NV_gpu_affinity /* extension macro not defined, so provide a stand-in handle type */
+typedef void* HGPUNV; /* opaque GPU handle used by ::cudaWGLGetDevice below */
+#endif /* WGL_NV_gpu_affinity */
+
+/**
+ * \brief Gets the CUDA device associated with hGpu
+ *
+ * Returns the CUDA device associated with a hGpu, if applicable.
+ *
+ * \param device - Returns the device associated with hGpu, or -1 if hGpu is
+ * not a compute device.
+ * \param hGpu   - Handle to a GPU, as queried via WGL_NV_gpu_affinity
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ *
+ * \sa
+ * ::WGL_NV_gpu_affinity,
+ * ::cuWGLGetDevice
+ */
+extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu);
+#endif /* _WIN32 */
+
+/** @} */ /* END CUDART_OPENGL */
+
+/**
+ * \addtogroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED]
+ * This section describes deprecated OpenGL interoperability functionality.
+ *
+ * @{
+ */
+
+/**
+ * CUDA GL Map Flags: usage hints accepted by the deprecated ::cudaGLSetBufferObjectMapFlags
+ */
+enum cudaGLMapFlags
+{
+  cudaGLMapFlagsNone         = 0,  /**< Default; Assume resource can be read/written */
+  cudaGLMapFlagsReadOnly     = 1,  /**< CUDA kernels will not write to this resource */
+  cudaGLMapFlagsWriteDiscard = 2   /**< CUDA kernels will only write to and will not read from this resource */
+};
+
+/**
+ * \brief Sets a CUDA device to use OpenGL interoperability
+ *
+ * \deprecated This function is deprecated as of CUDA 5.0. 
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA device with an OpenGL
+ * context in order to achieve maximum interoperability performance.
+ *
+ * This function will immediately initialize the primary context on 
+ * \p device if needed.
+ *
+ * \param device - Device to use for OpenGL interoperability
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorSetOnActiveProcess
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsGLRegisterBuffer, ::cudaGraphicsGLRegisterImage
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
+
+/**
+ * \brief Registers a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Registers the buffer object of ID \p bufObj for access by
+ * CUDA. This function must be called before CUDA can map the buffer
+ * object.  The OpenGL context used to create the buffer, or another
+ * context from the same share group, must be bound to the current
+ * thread when this is called.
+ *
+ * \param bufObj - Buffer object ID to register
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsGLRegisterBuffer
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint bufObj);
+
+/**
+ * \brief Maps a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Maps the buffer object of ID \p bufObj into the address space of
+ * CUDA and returns in \p *devPtr the base pointer of the resulting
+ * mapping.  The buffer must have previously been registered by
+ * calling ::cudaGLRegisterBufferObject().  While a buffer is mapped
+ * by CUDA, any OpenGL operation which references the buffer will
+ * result in undefined behavior.  The OpenGL context used to create
+ * the buffer, or another context from the same share group, must be
+ * bound to the current thread when this is called.
+ *
+ * All streams in the current thread are synchronized with the current
+ * GL context.
+ *
+ * \param devPtr - Returned device pointer to CUDA object
+ * \param bufObj - Buffer object ID to map
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsMapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj);
+
+/**
+ * \brief Unmaps a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Unmaps the buffer object of ID \p bufObj for access by CUDA.  When
+ * a buffer is unmapped, the base address returned by
+ * ::cudaGLMapBufferObject() is invalid and subsequent references to
+ * the address result in undefined behavior.  The OpenGL context used
+ * to create the buffer, or another context from the same share group,
+ * must be bound to the current thread when this is called.
+ *
+ * All streams in the current thread are synchronized with the current
+ * GL context.
+ *
+ * \param bufObj - Buffer object to unmap
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnmapBufferObjectFailed
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsUnmapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj);
+
+/**
+ * \brief Unregisters a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Unregisters the buffer object of ID \p bufObj for access by CUDA
+ * and releases any CUDA resources associated with the buffer.  Once a
+ * buffer is unregistered, it may no longer be mapped by CUDA.  The GL
+ * context used to create the buffer, or another context from the
+ * same share group, must be bound to the current thread when this is
+ * called.
+ *
+ * \param bufObj - Buffer object to unregister
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsUnregisterResource
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint bufObj);
+
+/**
+ * \brief Set usage flags for mapping an OpenGL buffer
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Sets the flags used when mapping the OpenGL buffer \p bufObj.
+ *
+ * Changes to flags will take effect the next time \p bufObj is mapped.
+ * The \p flags argument may be any of the following:
+ *
+ * - ::cudaGLMapFlagsNone: Specifies no hints about how this buffer will
+ * be used. It is therefore assumed that this buffer will be read from and
+ * written to by CUDA kernels. This is the default value.
+ * - ::cudaGLMapFlagsReadOnly: Specifies that CUDA kernels which access this
+ * buffer will not write to the buffer.
+ * - ::cudaGLMapFlagsWriteDiscard: Specifies that CUDA kernels which access
+ * this buffer will not read from the buffer and will write over the
+ * entire contents of the buffer, so none of the data previously stored in
+ * the buffer will be preserved.
+ *
+ * If \p bufObj has not been registered for use with CUDA, then
+ * ::cudaErrorInvalidResourceHandle is returned. If \p bufObj is presently
+ * mapped for access by CUDA, then ::cudaErrorUnknown is returned.
+ *
+ * \param bufObj    - Registered buffer object to set flags for
+ * \param flags     - Parameters for buffer mapping
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsResourceSetMapFlags
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint bufObj, unsigned int flags); 
+
+/**
+ * \brief Maps a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Maps the buffer object of ID \p bufObj into the address space of
+ * CUDA and returns in \p *devPtr the base pointer of the resulting
+ * mapping.  The buffer must have previously been registered by
+ * calling ::cudaGLRegisterBufferObject().  While a buffer is mapped
+ * by CUDA, any OpenGL operation which references the buffer will
+ * result in undefined behavior.  The OpenGL context used to create
+ * the buffer, or another context from the same share group, must be
+ * bound to the current thread when this is called.
+ *
+ * Stream \p stream is synchronized with the current GL context.
+ *
+ * \param devPtr - Returned device pointer to CUDA object
+ * \param bufObj - Buffer object ID to map
+ * \param stream - Stream to synchronize
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsMapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **devPtr, GLuint bufObj, cudaStream_t stream);
+
+/**
+ * \brief Unmaps a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Unmaps the buffer object of ID \p bufObj for access by CUDA.  When
+ * a buffer is unmapped, the base address returned by
+ * ::cudaGLMapBufferObject() is invalid and subsequent references to
+ * the address result in undefined behavior.  The OpenGL context used
+ * to create the buffer, or another context from the same share group,
+ * must be bound to the current thread when this is called.
+ *
+ * Stream \p stream is synchronized with the current GL context.
+ *
+ * \param bufObj - Buffer object to unmap
+ * \param stream - Stream to synchronize
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnmapBufferObjectFailed
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsUnmapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint bufObj, cudaStream_t stream);
+
+/** @} */ /* END CUDART_OPENGL_DEPRECATED */
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#undef __CUDA_DEPRECATED
+
+#endif /* __CUDA_GL_INTEROP_H__ */
+
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_runtime_api.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_runtime_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..64c92003b6f30d4050eb7f888db3b74e99492f24
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_runtime_api.h
@@ -0,0 +1,14059 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+
+
+#if !defined(__CUDA_RUNTIME_API_H__)
+#define __CUDA_RUNTIME_API_H__
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_API_H__
+#endif
+
+/**
+ * \latexonly
+ * \page sync_async API synchronization behavior
+ *
+ * \section memcpy_sync_async_behavior Memcpy
+ * The API provides memcpy/memset functions in both synchronous and asynchronous forms,
+ * the latter having an \e "Async" suffix. This is a misnomer as each function
+ * may exhibit synchronous or asynchronous behavior depending on the arguments
+ * passed to the function. In the reference documentation, each memcpy function is
+ * categorized as \e synchronous or \e asynchronous, corresponding to the definitions
+ * below.
+ * 
+ * \subsection MemcpySynchronousBehavior Synchronous
+ * 
+ * <ol>
+ * <li> For transfers from pageable host memory to device memory, a stream sync is performed
+ * before the copy is initiated. The function will return once the pageable
+ * buffer has been copied to the staging memory for DMA transfer to device memory,
+ * but the DMA to final destination may not have completed.
+ * 
+ * <li> For transfers from pinned host memory to device memory, the function is synchronous
+ * with respect to the host.
+ *
+ * <li> For transfers from device to either pageable or pinned host memory, the function returns
+ * only once the copy has completed.
+ * 
+ * <li> For transfers from device memory to device memory, no host-side synchronization is
+ * performed.
+ *
+ * <li> For transfers from any host memory to any host memory, the function is fully
+ * synchronous with respect to the host.
+ * </ol>
+ * 
+ * \subsection MemcpyAsynchronousBehavior Asynchronous
+ *
+ * <ol>
+ * <li> For transfers between device memory and pageable host memory, the function might 
+ * be synchronous with respect to host.
+ *
+ * <li> For transfers from any host memory to any host memory, the function is fully
+ * synchronous with respect to the host.
+ * 
+ * <li> If pageable memory must first be staged to pinned memory, the driver may
+ * synchronize with the stream and stage the copy into pinned memory.
+ *
+ * <li> For all other transfers, the function should be fully asynchronous.
+ * </ol>
+ *
+ * \section memset_sync_async_behavior Memset
+ * The cudaMemset functions are asynchronous with respect to the host
+ * except when the target memory is pinned host memory. The \e Async
+ * versions are always asynchronous with respect to the host.
+ *
+ * \section kernel_launch_details Kernel Launches
+ * Kernel launches are asynchronous with respect to the host. Details of
+ * concurrent kernel execution and data transfers can be found in the CUDA
+ * Programmers Guide.
+ *
+ * \endlatexonly
+ */
+
+/**
+ * There are two levels for the runtime API.
+ *
+ * The C API (<i>cuda_runtime_api.h</i>) is
+ * a C-style interface that does not require compiling with \p nvcc.
+ *
+ * The \ref CUDART_HIGHLEVEL "C++ API" (<i>cuda_runtime.h</i>) is a
+ * C++-style interface built on top of the C API. It wraps some of the
+ * C API routines, using overloading, references and default arguments.
+ * These wrappers can be used from C++ code and can be compiled with any C++
+ * compiler. The C++ API also has some CUDA-specific wrappers that wrap
+ * C API routines that deal with symbols, textures, and device functions.
+ * These wrappers require the use of \p nvcc because they depend on code being
+ * generated by the compiler. For example, the execution configuration syntax
+ * to invoke kernels is only available in source code compiled with \p nvcc.
+ */
+
+/** CUDA Runtime API Version */
+#define CUDART_VERSION  12040
+
+#if defined(__CUDA_API_VER_MAJOR__) && defined(__CUDA_API_VER_MINOR__)
+# define __CUDART_API_VERSION ((__CUDA_API_VER_MAJOR__ * 1000) + (__CUDA_API_VER_MINOR__ * 10))
+#else
+# define __CUDART_API_VERSION CUDART_VERSION
+#endif
+
+#ifndef __DOXYGEN_ONLY__
+#include "crt/host_defines.h"
+#endif
+#include "builtin_types.h"
+
+#if !defined(__CUDACC_RTC_MINIMAL__) && ((defined(__CUDACC_RDC__)  || defined(__CUDACC_EWP__) || !defined(__CUDACC_RTC__)))
+#include "cuda_device_runtime_api.h"
+#endif /* !defined(__CUDACC_RTC_MINIMAL__) && (defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__) || !defined(__CUDACC_RTC__)) */
+
+
+#ifndef __CUDACC_RTC_MINIMAL__
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) || defined(__CUDA_API_VERSION_INTERNAL)
+    #define __CUDART_API_PER_THREAD_DEFAULT_STREAM
+    #define __CUDART_API_PTDS(api) api ## _ptds
+    #define __CUDART_API_PTSZ(api) api ## _ptsz
+#else
+    #define __CUDART_API_PTDS(api) api
+    #define __CUDART_API_PTSZ(api) api
+#endif
+
+#define cudaSignalExternalSemaphoresAsync  __CUDART_API_PTSZ(cudaSignalExternalSemaphoresAsync_v2)
+#define cudaWaitExternalSemaphoresAsync    __CUDART_API_PTSZ(cudaWaitExternalSemaphoresAsync_v2)
+
+    #define cudaStreamGetCaptureInfo       __CUDART_API_PTSZ(cudaStreamGetCaptureInfo_v2)
+
+#define cudaGetDeviceProperties cudaGetDeviceProperties_v2
+
+#if defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM)
+    #define cudaMemcpy                     __CUDART_API_PTDS(cudaMemcpy)
+    #define cudaMemcpyToSymbol             __CUDART_API_PTDS(cudaMemcpyToSymbol)
+    #define cudaMemcpyFromSymbol           __CUDART_API_PTDS(cudaMemcpyFromSymbol)
+    #define cudaMemcpy2D                   __CUDART_API_PTDS(cudaMemcpy2D)
+    #define cudaMemcpyToArray              __CUDART_API_PTDS(cudaMemcpyToArray)
+    #define cudaMemcpy2DToArray            __CUDART_API_PTDS(cudaMemcpy2DToArray)
+    #define cudaMemcpyFromArray            __CUDART_API_PTDS(cudaMemcpyFromArray)
+    #define cudaMemcpy2DFromArray          __CUDART_API_PTDS(cudaMemcpy2DFromArray)
+    #define cudaMemcpyArrayToArray         __CUDART_API_PTDS(cudaMemcpyArrayToArray)
+    #define cudaMemcpy2DArrayToArray       __CUDART_API_PTDS(cudaMemcpy2DArrayToArray)
+    #define cudaMemcpy3D                   __CUDART_API_PTDS(cudaMemcpy3D)
+    #define cudaMemcpy3DPeer               __CUDART_API_PTDS(cudaMemcpy3DPeer)
+    #define cudaMemset                     __CUDART_API_PTDS(cudaMemset)
+    #define cudaMemset2D                   __CUDART_API_PTDS(cudaMemset2D)
+    #define cudaMemset3D                   __CUDART_API_PTDS(cudaMemset3D)
+    #define cudaGraphInstantiateWithParams __CUDART_API_PTSZ(cudaGraphInstantiateWithParams)
+    #define cudaGraphUpload                __CUDART_API_PTSZ(cudaGraphUpload)
+    #define cudaGraphLaunch                __CUDART_API_PTSZ(cudaGraphLaunch)
+    #define cudaStreamBeginCapture         __CUDART_API_PTSZ(cudaStreamBeginCapture)
+    #define cudaStreamBeginCaptureToGraph  __CUDART_API_PTSZ(cudaStreamBeginCaptureToGraph)
+    #define cudaStreamEndCapture           __CUDART_API_PTSZ(cudaStreamEndCapture)
+    #define cudaStreamGetCaptureInfo_v3    __CUDART_API_PTSZ(cudaStreamGetCaptureInfo_v3)
+    #define cudaStreamUpdateCaptureDependencies  __CUDART_API_PTSZ(cudaStreamUpdateCaptureDependencies)
+    #define cudaStreamUpdateCaptureDependencies_v2  __CUDART_API_PTSZ(cudaStreamUpdateCaptureDependencies_v2)
+    #define cudaStreamIsCapturing          __CUDART_API_PTSZ(cudaStreamIsCapturing)
+    #define cudaMemcpyAsync                __CUDART_API_PTSZ(cudaMemcpyAsync)
+    #define cudaMemcpyToSymbolAsync        __CUDART_API_PTSZ(cudaMemcpyToSymbolAsync)
+    #define cudaMemcpyFromSymbolAsync      __CUDART_API_PTSZ(cudaMemcpyFromSymbolAsync)
+    #define cudaMemcpy2DAsync              __CUDART_API_PTSZ(cudaMemcpy2DAsync)
+    #define cudaMemcpyToArrayAsync         __CUDART_API_PTSZ(cudaMemcpyToArrayAsync)
+    #define cudaMemcpy2DToArrayAsync       __CUDART_API_PTSZ(cudaMemcpy2DToArrayAsync)
+    #define cudaMemcpyFromArrayAsync       __CUDART_API_PTSZ(cudaMemcpyFromArrayAsync)
+    #define cudaMemcpy2DFromArrayAsync     __CUDART_API_PTSZ(cudaMemcpy2DFromArrayAsync)
+    #define cudaMemcpy3DAsync              __CUDART_API_PTSZ(cudaMemcpy3DAsync)
+    #define cudaMemcpy3DPeerAsync          __CUDART_API_PTSZ(cudaMemcpy3DPeerAsync)
+    #define cudaMemsetAsync                __CUDART_API_PTSZ(cudaMemsetAsync)
+    #define cudaMemset2DAsync              __CUDART_API_PTSZ(cudaMemset2DAsync)
+    #define cudaMemset3DAsync              __CUDART_API_PTSZ(cudaMemset3DAsync)
+    #define cudaStreamQuery                __CUDART_API_PTSZ(cudaStreamQuery)
+    #define cudaStreamGetFlags             __CUDART_API_PTSZ(cudaStreamGetFlags)
+    #define cudaStreamGetId                __CUDART_API_PTSZ(cudaStreamGetId)
+    #define cudaStreamGetPriority          __CUDART_API_PTSZ(cudaStreamGetPriority)
+    #define cudaEventRecord                __CUDART_API_PTSZ(cudaEventRecord)
+    #define cudaEventRecordWithFlags       __CUDART_API_PTSZ(cudaEventRecordWithFlags)
+    #define cudaStreamWaitEvent            __CUDART_API_PTSZ(cudaStreamWaitEvent)
+    #define cudaStreamAddCallback          __CUDART_API_PTSZ(cudaStreamAddCallback)
+    #define cudaStreamAttachMemAsync       __CUDART_API_PTSZ(cudaStreamAttachMemAsync)
+    #define cudaStreamSynchronize          __CUDART_API_PTSZ(cudaStreamSynchronize)
+    #define cudaLaunchKernel               __CUDART_API_PTSZ(cudaLaunchKernel)
+    #define cudaLaunchKernelExC            __CUDART_API_PTSZ(cudaLaunchKernelExC)
+    #define cudaLaunchHostFunc             __CUDART_API_PTSZ(cudaLaunchHostFunc)
+    #define cudaMemPrefetchAsync           __CUDART_API_PTSZ(cudaMemPrefetchAsync)
+    #define cudaMemPrefetchAsync_v2        __CUDART_API_PTSZ(cudaMemPrefetchAsync_v2)
+    #define cudaLaunchCooperativeKernel    __CUDART_API_PTSZ(cudaLaunchCooperativeKernel)
+    #define cudaStreamCopyAttributes       __CUDART_API_PTSZ(cudaStreamCopyAttributes)
+    #define cudaStreamGetAttribute         __CUDART_API_PTSZ(cudaStreamGetAttribute)
+    #define cudaStreamSetAttribute         __CUDART_API_PTSZ(cudaStreamSetAttribute)
+    #define cudaMallocAsync                __CUDART_API_PTSZ(cudaMallocAsync)
+    #define cudaFreeAsync                  __CUDART_API_PTSZ(cudaFreeAsync)
+    #define cudaMallocFromPoolAsync        __CUDART_API_PTSZ(cudaMallocFromPoolAsync)
+    #define cudaGetDriverEntryPoint        __CUDART_API_PTSZ(cudaGetDriverEntryPoint)
+#endif
+
+#endif  /* __CUDACC_RTC_MINIMAL__ */
+
+/** \cond impl_private */
+#if !defined(__dv)
+
+#if defined(__cplusplus)
+
+#define __dv(v) \
+        = v
+
+#else /* __cplusplus */
+
+#define __dv(v)
+
+#endif /* __cplusplus */
+
+#endif /* !__dv */
+/** \endcond impl_private */
+
+#if (defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350))   /** Visible to SM>=3.5 and "__host__ __device__" only **/
+
+#define CUDART_DEVICE __device__ 
+
+#else
+
+#define CUDART_DEVICE
+
+#endif /** CUDART_DEVICE */
+
+#if !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+
+/** \cond impl_private */
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+/** \endcond impl_private */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \defgroup CUDART_DEVICE Device Management
+ *
+ * ___MANBRIEF___ device management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the device management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Destroy all allocations and reset all state on the current device
+ * in the current process.
+ *
+ * Explicitly destroys and cleans up all resources associated with the current
+ * device in the current process. It is the caller's responsibility to ensure
+ * that the resources are not accessed or passed in subsequent API calls and
+ * doing so will result in undefined behavior. These resources include CUDA types
+ * such as ::cudaStream_t, ::cudaEvent_t, ::cudaArray_t, ::cudaMipmappedArray_t,
+ * ::cudaTextureObject_t, ::cudaSurfaceObject_t, ::textureReference, ::surfaceReference,
+ * ::cudaExternalMemory_t, ::cudaExternalSemaphore_t and ::cudaGraphicsResource_t.
+ * Any subsequent API call to this device will reinitialize the device.
+ *
+ * Note that this function will reset the device immediately.  It is the caller's
+ * responsibility to ensure that the device is not being accessed by any 
+ * other host threads from the process when this function is called.
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSynchronize
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void);
+
+/**
+ * \brief Wait for compute device to finish
+ *
+ * Blocks until the device has completed all preceding requested tasks.
+ * ::cudaDeviceSynchronize() returns an error if one of the preceding tasks
+ * has failed. If the ::cudaDeviceScheduleBlockingSync flag was set for 
+ * this device, the host thread will block until the device has finished 
+ * its work.
+ *
+ * \return
+ * ::cudaSuccess
+ * \note_device_sync_deprecated
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceReset,
+ * ::cuCtxSynchronize
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
+
+/**
+ * \brief Set resource limits
+ *
+ * Setting \p limit to \p value is a request by the application to update
+ * the current limit maintained by the device.  The driver is free to
+ * modify the requested value to meet h/w requirements (this could be
+ * clamping to minimum or maximum values, rounding up to nearest element
+ * size, etc).  The application can use ::cudaDeviceGetLimit() to find out
+ * exactly what the limit has been set to.
+ *
+ * Setting each ::cudaLimit has its own specific restrictions, so each is
+ * discussed here.
+ *
+ * - ::cudaLimitStackSize controls the stack size in bytes of each GPU thread.
+ *
+ * - ::cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO
+ *   used by the ::printf() device system call. Setting
+ *   ::cudaLimitPrintfFifoSize must not be performed after launching any kernel
+ *   that uses the ::printf() device system call - in such case
+ *   ::cudaErrorInvalidValue will be returned.
+ *
+ * - ::cudaLimitMallocHeapSize controls the size in bytes of the heap used by
+ *   the ::malloc() and ::free() device system calls. Setting
+ *   ::cudaLimitMallocHeapSize must not be performed after launching any kernel
+ *   that uses the ::malloc() or ::free() device system calls - in such case
+ *   ::cudaErrorInvalidValue will be returned.
+ *
+ * - ::cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a
+ *   grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting
+ *   this limit must be performed before any launch of a kernel that uses the
+ *   device runtime and calls ::cudaDeviceSynchronize() above the default sync
+ *   depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail
+ *   with error code ::cudaErrorSyncDepthExceeded if the limitation is
+ *   violated. This limit can be set smaller than the default or up to the maximum
+ *   launch depth of 24. When setting this limit, keep in mind that additional
+ *   levels of sync depth require the runtime to reserve large amounts of
+ *   device memory which can no longer be used for user allocations. If these
+ *   reservations of device memory fail, ::cudaDeviceSetLimit will return
+ *   ::cudaErrorMemoryAllocation, and the limit can be reset to a lower value.
+ *   This limit is only applicable to devices of compute capability < 9.0.
+ *   Attempting to set this limit on devices of other compute capability will
+ *   result in error ::cudaErrorUnsupportedLimit being returned.
+ *
+ * - ::cudaLimitDevRuntimePendingLaunchCount controls the maximum number of
+ *   outstanding device runtime launches that can be made from the current
+ *   device. A grid is outstanding from the point of launch up until the grid
+ *   is known to have been completed. Device runtime launches which violate
+ *   this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when
+ *   ::cudaGetLastError() is called after launch. If more pending launches than
+ *   the default (2048 launches) are needed for a module using the device
+ *   runtime, this limit can be increased. Keep in mind that being able to
+ *   sustain additional pending launches will require the runtime to reserve
+ *   larger amounts of device memory upfront which can no longer be used for
+ *   allocations. If these reservations fail, ::cudaDeviceSetLimit will return
+ *   ::cudaErrorMemoryAllocation, and the limit can be reset to a lower value.
+ *   This limit is only applicable to devices of compute capability 3.5 and
+ *   higher. Attempting to set this limit on devices of compute capability less
+ *   than 3.5 will result in the error ::cudaErrorUnsupportedLimit being
+ *   returned.
+ *
+ * - ::cudaLimitMaxL2FetchGranularity controls the L2 cache fetch granularity.
+ *   Values can range from 0B to 128B. This is purely a performance hint and
+ *   it can be ignored or clamped depending on the platform.
+ *
+ * - ::cudaLimitPersistingL2CacheSize controls size in bytes available
+ *   for persisting L2 cache. This is purely a performance hint and it
+ *   can be ignored or clamped depending on the platform.
+ *
+ * \param limit - Limit to set
+ * \param value - Size of limit
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnsupportedLimit,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceGetLimit,
+ * ::cuCtxSetLimit
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value);
+
+/**
+ * \brief Return resource limits
+ *
+ * Returns in \p *pValue the current size of \p limit. The following ::cudaLimit values are supported.
+ * - ::cudaLimitStackSize is the stack size in bytes of each GPU thread.
+ * - ::cudaLimitPrintfFifoSize is the size in bytes of the shared FIFO used by the
+ *   ::printf() device system call.
+ * - ::cudaLimitMallocHeapSize is the size in bytes of the heap used by the
+ *   ::malloc() and ::free() device system calls.
+ * - ::cudaLimitDevRuntimeSyncDepth is the maximum grid depth at which a
+ *   thread can issue the device runtime call ::cudaDeviceSynchronize()
+ *   to wait on child grid launches to complete. This functionality is removed
+ *   for devices of compute capability >= 9.0, and hence will return error
+ *   ::cudaErrorUnsupportedLimit on such devices.
+ * - ::cudaLimitDevRuntimePendingLaunchCount is the maximum number of outstanding
+ *   device runtime launches.
+ * - ::cudaLimitMaxL2FetchGranularity is the L2 cache fetch granularity.
+ * - ::cudaLimitPersistingL2CacheSize is the persisting L2 cache size in bytes.
+ *
+ * \param limit  - Limit to query
+ * \param pValue - Returned size of the limit
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnsupportedLimit,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceSetLimit,
+ * ::cuCtxGetLimit
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
+
+/**
+ * \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given element size.
+ *
+ * Returns in \p maxWidthInElements the maximum number of elements allocatable in a 1D linear texture
+ * for given format descriptor \p fmtDesc.
+ *
+ * \param maxWidthInElements    - Returns maximum number of texture elements allocatable for given \p fmtDesc.
+ * \param fmtDesc               - Texture format description.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnsupportedLimit,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuDeviceGetTexture1DLinearMaxWidth
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, const struct cudaChannelFormatDesc *fmtDesc, int device);
+#endif
+
+/**
+ * \brief Returns the preferred cache configuration for the current device.
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this returns through \p pCacheConfig the preferred cache
+ * configuration for the current device. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute functions.
+ *
+ * This will return a \p pCacheConfig of ::cudaFuncCachePreferNone on devices
+ * where the size of the L1 cache and shared memory are fixed.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
+ *
+ * \param pCacheConfig - Returned cache configuration
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetCacheConfig,
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * ::cuCtxGetCacheConfig
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
+
+/**
+ * \brief Returns numerical values that correspond to the least and
+ * greatest stream priorities.
+ *
+ * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond
+ * to the least and greatest stream priorities respectively. Stream priorities
+ * follow a convention where lower numbers imply greater priorities. The range of
+ * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority].
+ * If the user attempts to create a stream with a priority value that is
+ * outside the meaningful range as specified by this API, the priority is
+ * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority
+ * respectively. See ::cudaStreamCreateWithPriority for details on creating a
+ * priority stream.
+ * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value
+ * is not desired.
+ *
+ * This function will return '0' in both \p *leastPriority and \p *greatestPriority if
+ * the current context's device does not support stream priorities
+ * (see ::cudaDeviceGetAttribute).
+ *
+ * \param leastPriority    - Pointer to an int in which the numerical value for least
+ *                           stream priority is returned
+ * \param greatestPriority - Pointer to an int in which the numerical value for greatest
+ *                           stream priority is returned
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreateWithPriority,
+ * ::cudaStreamGetPriority,
+ * ::cuCtxGetStreamPriorityRange
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority);
+
+/**
+ * \brief Sets the preferred cache configuration for the current device.
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p cacheConfig the preferred cache
+ * configuration for the current device. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute the function. Any
+ * function preference set via
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)"
+ * or
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)"
+ * will be preferred over this device-wide setting. Setting the device-wide
+ * cache configuration to ::cudaFuncCachePreferNone will cause subsequent
+ * kernel launches to prefer to not change the cache configuration unless
+ * required to launch the kernel.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
+ *
+ * \param cacheConfig - Requested cache configuration
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceGetCacheConfig,
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * ::cuCtxSetCacheConfig
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig);
+
+/**
+ * \brief Returns a handle to a compute device
+ *
+ * Returns in \p *device a device ordinal given a PCI bus ID string.
+ *
+ * \param device   - Returned device ordinal
+ *
+ * \param pciBusId - String in one of the following forms: 
+ * [domain]:[bus]:[device].[function]
+ * [domain]:[bus]:[device]
+ * [bus]:[device].[function]
+ * where \p domain, \p bus, \p device, and \p function are all hexadecimal values
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceGetPCIBusId,
+ * ::cuDeviceGetByPCIBusId
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetByPCIBusId(int *device, const char *pciBusId);
+
+/**
+ * \brief Returns a PCI Bus Id string for the device
+ *
+ * Returns an ASCII string identifying the device \p device in the NULL-terminated
+ * string pointed to by \p pciBusId. \p len specifies the maximum length of the
+ * string that may be returned.
+ *
+ * \param pciBusId - Returned identifier string for the device in the following format
+ * [domain]:[bus]:[device].[function]
+ * where \p domain, \p bus, \p device, and \p function are all hexadecimal values.
+ * pciBusId should be large enough to store 13 characters including the NULL-terminator.
+ *
+ * \param len      - Maximum length of string to store in \p pciBusId
+ *
+ * \param device   - Device to get identifier string for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceGetByPCIBusId,
+ * ::cuDeviceGetPCIBusId
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId, int len, int device);
+
+/**
+ * \brief Gets an interprocess handle for a previously allocated event
+ *
+ * Takes as input a previously allocated event. This event must have been 
+ * created with the ::cudaEventInterprocess and ::cudaEventDisableTiming
+ * flags set. This opaque handle may be copied into other processes and
+ * opened with ::cudaIpcOpenEventHandle to allow efficient hardware
+ * synchronization between GPU work in different processes.
+ *
+ * After the event has been opened in the importing process, 
+ * ::cudaEventRecord, ::cudaEventSynchronize, ::cudaStreamWaitEvent and 
+ * ::cudaEventQuery may be used in either process. Performing operations 
+ * on the imported event after the exported event has been freed 
+ * with ::cudaEventDestroy will result in undefined behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is restricted to GPUs in TCC mode.
+ * Users can test their device for IPC functionality by calling
+ * ::cudaDeviceGetAttribute with ::cudaDevAttrIpcEventSupport
+ *
+ * \param handle - Pointer to a user allocated cudaIpcEventHandle
+ *                    in which to return the opaque event handle
+ * \param event   - Event allocated with ::cudaEventInterprocess and 
+ *                    ::cudaEventDisableTiming flags.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaEventCreate,
+ * ::cudaEventDestroy,
+ * ::cudaEventSynchronize,
+ * ::cudaEventQuery,
+ * ::cudaStreamWaitEvent,
+ * ::cudaIpcOpenEventHandle,
+ * ::cudaIpcGetMemHandle,
+ * ::cudaIpcOpenMemHandle,
+ * ::cudaIpcCloseMemHandle,
+ * ::cuIpcGetEventHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event);
+
+/**
+ * \brief Opens an interprocess event handle for use in the current process
+ *
+ * Opens an interprocess event handle exported from another process with 
+ * ::cudaIpcGetEventHandle. This function returns a ::cudaEvent_t that behaves like 
+ * a locally created event with the ::cudaEventDisableTiming flag specified. 
+ * This event must be freed with ::cudaEventDestroy.
+ *
+ * Performing operations on the imported event after the exported event has 
+ * been freed with ::cudaEventDestroy will result in undefined behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is restricted to GPUs in TCC mode.
+ * Users can test their device for IPC functionality by calling
+ * ::cudaDeviceGetAttribute with ::cudaDevAttrIpcEventSupport
+ *
+ * \param event - Returns the imported event
+ * \param handle  - Interprocess handle to open
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorDeviceUninitialized
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaEventCreate,
+ * ::cudaEventDestroy,
+ * ::cudaEventSynchronize,
+ * ::cudaEventQuery,
+ * ::cudaStreamWaitEvent,
+ * ::cudaIpcGetEventHandle,
+ * ::cudaIpcGetMemHandle,
+ * ::cudaIpcOpenMemHandle,
+ * ::cudaIpcCloseMemHandle,
+ * ::cuIpcOpenEventHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle);
+
+/**
+ * \brief Gets an interprocess memory handle for an existing device memory
+ *          allocation
+ *
+ * Takes a pointer to the base of an existing device memory allocation created 
+ * with ::cudaMalloc and exports it for use in another process. This is a 
+ * lightweight operation and may be called multiple times on an allocation
+ * without adverse effects. 
+ *
+ * If a region of memory is freed with ::cudaFree and a subsequent call
+ * to ::cudaMalloc returns memory with the same device address,
+ * ::cudaIpcGetMemHandle will return a unique handle for the
+ * new memory. 
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is restricted to GPUs in TCC mode.
+ * Users can test their device for IPC functionality by calling
+ * ::cudaDeviceGetAttribute with ::cudaDevAttrIpcEventSupport
+ *
+ * \param handle - Pointer to user allocated ::cudaIpcMemHandle to return
+ *                    the handle in.
+ * \param devPtr - Base pointer to previously allocated device memory 
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMalloc,
+ * ::cudaFree,
+ * ::cudaIpcGetEventHandle,
+ * ::cudaIpcOpenEventHandle,
+ * ::cudaIpcOpenMemHandle,
+ * ::cudaIpcCloseMemHandle,
+ * ::cuIpcGetMemHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr);
+
+/**
+ * \brief Opens an interprocess memory handle exported from another process
+ *          and returns a device pointer usable in the local process.
+ *
+ * Maps memory exported from another process with ::cudaIpcGetMemHandle into
+ * the current device address space. For contexts on different devices 
+ * ::cudaIpcOpenMemHandle can attempt to enable peer access between the
+ * devices as if the user called ::cudaDeviceEnablePeerAccess. This behavior is 
+ * controlled by the ::cudaIpcMemLazyEnablePeerAccess flag. 
+ * ::cudaDeviceCanAccessPeer can determine if a mapping is possible.
+ *
+ * ::cudaIpcOpenMemHandle can open handles to devices that may not be visible
+ * in the process calling the API.
+ *
+ * Contexts that may open ::cudaIpcMemHandles are restricted in the following way.
+ * ::cudaIpcMemHandles from each device in a given process may only be opened 
+ * by one context per device per other process.
+ *
+ * If the memory handle has already been opened by the current context, the
+ * reference count on the handle is incremented by 1 and the existing device pointer
+ * is returned.
+ *
+ * Memory returned from ::cudaIpcOpenMemHandle must be freed with
+ * ::cudaIpcCloseMemHandle.
+ *
+ * Calling ::cudaFree on an exported memory region before calling
+ * ::cudaIpcCloseMemHandle in the importing context will result in undefined
+ * behavior.
+ * 
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is restricted to GPUs in TCC mode.
+ * Users can test their device for IPC functionality by calling
+ * ::cudaDeviceGetAttribute with ::cudaDevAttrIpcEventSupport
+ *
+ * \param devPtr - Returned device pointer
+ * \param handle - ::cudaIpcMemHandle to open
+ * \param flags  - Flags for this operation. Must be specified as ::cudaIpcMemLazyEnablePeerAccess
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorDeviceUninitialized,
+ * ::cudaErrorTooManyPeers,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \note No guarantees are made about the address returned in \p *devPtr.  
+ * In particular, multiple processes may not receive the same address for the same \p handle.
+ *
+ * \sa
+ * ::cudaMalloc,
+ * ::cudaFree,
+ * ::cudaIpcGetEventHandle,
+ * ::cudaIpcOpenEventHandle,
+ * ::cudaIpcGetMemHandle,
+ * ::cudaIpcCloseMemHandle,
+ * ::cudaDeviceEnablePeerAccess,
+ * ::cudaDeviceCanAccessPeer,
+ * ::cuIpcOpenMemHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags);
+
+/**
+ * \brief Attempts to close memory mapped with cudaIpcOpenMemHandle
+ * 
+ * Decrements the reference count of the memory returned by ::cudaIpcOpenMemHandle by 1.
+ * When the reference count reaches 0, this API unmaps the memory. The original allocation
+ * in the exporting process as well as imported mappings in other processes
+ * will be unaffected.
+ *
+ * Any resources used to enable peer access will be freed if this is the
+ * last mapping using them.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is restricted to GPUs in TCC mode.
+ * Users can test their device for IPC functionality by calling
+ * ::cudaDeviceGetAttribute with ::cudaDevAttrIpcEventSupport
+ *
+ * \param devPtr - Device pointer returned by ::cudaIpcOpenMemHandle
+ * 
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMalloc,
+ * ::cudaFree,
+ * ::cudaIpcGetEventHandle,
+ * ::cudaIpcOpenEventHandle,
+ * ::cudaIpcGetMemHandle,
+ * ::cudaIpcOpenMemHandle,
+ * ::cuIpcCloseMemHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr);
+
+/**
+ * \brief Blocks until remote writes are visible to the specified scope
+ *
+ * Blocks until remote writes to the target context via mappings created
+ * through GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see
+ * https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are
+ * visible to the specified scope.
+ *
+ * If the scope equals or lies within the scope indicated by
+ * ::cudaDevAttrGPUDirectRDMAWritesOrdering, the call will be a no-op and
+ * can be safely omitted for performance. This can be determined by
+ * comparing the numerical values between the two enums, with smaller
+ * scopes having smaller values.
+ *
+ * Users may query support for this API via ::cudaDevAttrGPUDirectRDMAFlushWritesOptions.
+ *
+ * \param target - The target of the operation, see cudaFlushGPUDirectRDMAWritesTarget
+ * \param scope  - The scope of the operation, see cudaFlushGPUDirectRDMAWritesScope
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuFlushGPUDirectRDMAWrites
+ */
+#if __CUDART_API_VERSION >= 11030
+extern __host__ cudaError_t CUDARTAPI cudaDeviceFlushGPUDirectRDMAWrites(enum cudaFlushGPUDirectRDMAWritesTarget target, enum cudaFlushGPUDirectRDMAWritesScope scope);
+#endif
+
+/**
+* \brief Registers a callback function to receive async notifications
+*
+* Registers \p callbackFunc to receive async notifications.
+*
+* The \p userData parameter is passed to the callback function at async notification time.
+* Likewise, \p callback is also passed to the callback function to distinguish between
+* multiple registered callbacks.
+*
+* The callback function being registered should be designed to return quickly (~10ms).
+* Any long running tasks should be queued for execution on an application thread.
+*
+* Callbacks may not call cudaDeviceRegisterAsyncNotification or cudaDeviceUnregisterAsyncNotification.
+* Doing so will result in ::cudaErrorNotPermitted. Async notification callbacks execute
+* in an undefined order and may be serialized.
+*
+* Returns in \p *callback a handle representing the registered callback instance.
+*
+* \param device - The device on which to register the callback
+* \param callbackFunc - The function to register as a callback
+* \param userData - A generic pointer to user data. This is passed into the callback function.
+* \param callback - A handle representing the registered callback instance
+*
+* \return
+* ::cudaSuccess,
+* ::cudaErrorNotSupported,
+* ::cudaErrorInvalidDevice,
+* ::cudaErrorInvalidValue,
+* ::cudaErrorNotPermitted,
+* ::cudaErrorUnknown
+* \notefnerr
+*
+* \sa
+* ::cudaDeviceUnregisterAsyncNotification
+*/
+extern __host__ cudaError_t CUDARTAPI cudaDeviceRegisterAsyncNotification(int device, cudaAsyncCallback callbackFunc, void* userData, cudaAsyncCallbackHandle_t* callback);
+
+/**
+* \brief Unregisters an async notification callback
+*
+* Unregisters \p callback so that the corresponding callback function will stop receiving
+* async notifications.
+*
+* \param device - The device from which to remove \p callback.
+* \param callback - The callback instance to unregister from receiving async notifications.
+*
+* \return
+* ::cudaSuccess,
+* ::cudaErrorNotSupported,
+* ::cudaErrorInvalidDevice,
+* ::cudaErrorInvalidValue,
+* ::cudaErrorNotPermitted,
+* ::cudaErrorUnknown
+* \notefnerr
+*
+* \sa
+* ::cudaDeviceRegisterAsyncNotification
+*/
+extern __host__ cudaError_t CUDARTAPI cudaDeviceUnregisterAsyncNotification(int device, cudaAsyncCallbackHandle_t callback);
+
+/** @} */ /* END CUDART_DEVICE */
+
+/**
+ * \defgroup CUDART_DEVICE_DEPRECATED Device Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated device management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the deprecated device management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the shared memory configuration for the current device.
+ *
+ * \deprecated
+ *
+ * This function will return in \p pConfig the current size of shared memory banks
+ * on the current device. On devices with configurable shared memory banks, 
+ * ::cudaDeviceSetSharedMemConfig can be used to change this setting, so that all 
+ * subsequent kernel launches will by default use the new bank size. When 
+ * ::cudaDeviceGetSharedMemConfig is called on devices without configurable shared 
+ * memory, it will return the fixed bank size of the hardware.
+ *
+ * The returned bank configurations can be either:
+ * - ::cudaSharedMemBankSizeFourByte - shared memory bank width is four bytes.
+ * - ::cudaSharedMemBankSizeEightByte - shared memory bank width is eight bytes.
+ *
+ * \param pConfig - Returned shared memory configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetCacheConfig,
+ * ::cudaDeviceGetCacheConfig,
+ * ::cudaDeviceSetSharedMemConfig,
+ * ::cudaFuncSetCacheConfig,
+ * ::cuCtxGetSharedMemConfig
+ */
+extern __CUDA_DEPRECATED __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
+
+/**
+ * \brief Sets the shared memory configuration for the current device.
+ *
+ * \deprecated
+ *
+ * On devices with configurable shared memory banks, this function will set
+ * the shared memory bank size which is used for all subsequent kernel launches.
+ * Any per-function setting of shared memory set via ::cudaFuncSetSharedMemConfig
+ * will override the device wide setting.
+ *
+ * Changing the shared memory configuration between launches may introduce
+ * a device side synchronization point.
+ *
+ * Changing the shared memory bank size will not increase shared memory usage
+ * or affect occupancy of kernels, but may have major effects on performance. 
+ * Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ * but will change what kinds of accesses to shared memory will result in bank 
+ * conflicts.
+ *
+ * This function will do nothing on devices with fixed shared memory bank size.
+ *
+ * The supported bank configurations are:
+ * - ::cudaSharedMemBankSizeDefault: set bank width to the device default (currently,
+ *   four bytes)
+ * - ::cudaSharedMemBankSizeFourByte: set shared memory bank width to be four bytes
+ *   natively.
+ * - ::cudaSharedMemBankSizeEightByte: set shared memory bank width to be eight 
+ *   bytes natively.
+ *
+ * \param config - Requested shared memory configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetCacheConfig,
+ * ::cudaDeviceGetCacheConfig,
+ * ::cudaDeviceGetSharedMemConfig,
+ * ::cudaFuncSetCacheConfig,
+ * ::cuCtxSetSharedMemConfig
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config);
+/** @} */ /* END CUDART_DEVICE_DEPRECATED */
+
+/**
+ * \defgroup CUDART_THREAD_DEPRECATED Thread Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated thread management functions of the CUDA runtime
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes deprecated thread management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Exit and clean up from CUDA launches
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is identical to the 
+ * non-deprecated function ::cudaDeviceReset(), which should be used
+ * instead.
+ *
+ * Explicitly destroys and cleans up all resources associated with the current
+ * device in the current process.  Any subsequent API call to this device will 
+ * reinitialize the device.  
+ *
+ * Note that this function will reset the device immediately.  It is the caller's
+ * responsibility to ensure that the device is not being accessed by any 
+ * other host threads from the process when this function is called.
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceReset
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadExit(void);
+
+/**
+ * \brief Wait for compute device to finish
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is similar to the 
+ * non-deprecated function ::cudaDeviceSynchronize(), which should be used
+ * instead.
+ *
+ * Blocks until the device has completed all preceding requested tasks.
+ * ::cudaThreadSynchronize() returns an error if one of the preceding tasks
+ * has failed. If the ::cudaDeviceScheduleBlockingSync flag was set for 
+ * this device, the host thread will block until the device has finished 
+ * its work.
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSynchronize
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void);
+
+/**
+ * \brief Set resource limits
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is identical to the 
+ * non-deprecated function ::cudaDeviceSetLimit(), which should be used
+ * instead.
+ *
+ * Setting \p limit to \p value is a request by the application to update
+ * the current limit maintained by the device.  The driver is free to
+ * modify the requested value to meet h/w requirements (this could be
+ * clamping to minimum or maximum values, rounding up to nearest element
+ * size, etc).  The application can use ::cudaThreadGetLimit() to find out
+ * exactly what the limit has been set to.
+ *
+ * Setting each ::cudaLimit has its own specific restrictions, so each is
+ * discussed here.
+ *
+ * - ::cudaLimitStackSize controls the stack size of each GPU thread.
+ *
+ * - ::cudaLimitPrintfFifoSize controls the size of the shared FIFO
+ *   used by the ::printf() device system call.
+ *   Setting ::cudaLimitPrintfFifoSize must be performed before
+ *   launching any kernel that uses the ::printf() device
+ *   system call, otherwise ::cudaErrorInvalidValue will be returned.
+ *
+ * - ::cudaLimitMallocHeapSize controls the size of the heap used
+ *   by the ::malloc() and ::free() device system calls.  Setting
+ *   ::cudaLimitMallocHeapSize must be performed before launching
+ *   any kernel that uses the ::malloc() or ::free() device system calls,
+ *   otherwise ::cudaErrorInvalidValue will be returned.
+ *
+ * \param limit - Limit to set
+ * \param value - Size in bytes of limit
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnsupportedLimit,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetLimit
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadSetLimit(enum cudaLimit limit, size_t value);
+
+/**
+ * \brief Returns resource limits
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is identical to the 
+ * non-deprecated function ::cudaDeviceGetLimit(), which should be used
+ * instead.
+ *
+ * Returns in \p *pValue the current size of \p limit.  The supported
+ * ::cudaLimit values are:
+ * - ::cudaLimitStackSize: stack size of each GPU thread;
+ * - ::cudaLimitPrintfFifoSize: size of the shared FIFO used by the
+ *   ::printf() device system call.
+ * - ::cudaLimitMallocHeapSize: size of the heap used by the
+ *   ::malloc() and ::free() device system calls;
+ *
+ * \param limit  - Limit to query
+ * \param pValue - Returned size in bytes of limit
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnsupportedLimit,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceGetLimit
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit);
+
+/**
+ * \brief Returns the preferred cache configuration for the current device.
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is identical to the 
+ * non-deprecated function ::cudaDeviceGetCacheConfig(), which should be 
+ * used instead.
+ * 
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this returns through \p pCacheConfig the preferred cache
+ * configuration for the current device. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute functions.
+ *
+ * This will return a \p pCacheConfig of ::cudaFuncCachePreferNone on devices
+ * where the size of the L1 cache and shared memory are fixed.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ *
+ * \param pCacheConfig - Returned cache configuration
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceGetCacheConfig
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig);
+
+/**
+ * \brief Sets the preferred cache configuration for the current device.
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is identical to the 
+ * non-deprecated function ::cudaDeviceSetCacheConfig(), which should be 
+ * used instead.
+ * 
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p cacheConfig the preferred cache
+ * configuration for the current device. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute the function. Any
+ * function preference set via
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)"
+ * or
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)"
+ * will be preferred over this device-wide setting. Setting the device-wide
+ * cache configuration to ::cudaFuncCachePreferNone will cause subsequent
+ * kernel launches to prefer to not change the cache configuration unless
+ * required to launch the kernel.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ *
+ * \param cacheConfig - Requested cache configuration
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetCacheConfig
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig);
+
+/** @} */ /* END CUDART_THREAD_DEPRECATED */
+
+/**
+ * \defgroup CUDART_ERROR Error Handling
+ *
+ * ___MANBRIEF___ error handling functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the error handling functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the last error from a runtime call
+ *
+ * Returns the last error that has been produced by any of the runtime calls
+ * in the same instance of the CUDA Runtime library in the host thread and
+ * resets it to ::cudaSuccess.
+ *
+ * Note: Multiple instances of the CUDA Runtime library can be present in an
+ * application when using a library that statically links the CUDA Runtime.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMissingConfiguration,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorUnmapBufferObjectFailed,
+ * ::cudaErrorInvalidDevicePointer,
+ * ::cudaErrorInvalidTexture,
+ * ::cudaErrorInvalidTextureBinding,
+ * ::cudaErrorInvalidChannelDescriptor,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorInvalidFilterSetting,
+ * ::cudaErrorInvalidNormSetting,
+ * ::cudaErrorUnknown,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorInsufficientDriver,
+ * ::cudaErrorNoDevice,
+ * ::cudaErrorSetOnActiveProcess,
+ * ::cudaErrorStartupFailure,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaPeekAtLastError, ::cudaGetErrorName, ::cudaGetErrorString, ::cudaError
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
+
+/**
+ * \brief Returns the last error from a runtime call
+ *
+ * Returns the last error that has been produced by any of the runtime calls
+ * in the same instance of the CUDA Runtime library in the host thread. This
+ * call does not reset the error to ::cudaSuccess like ::cudaGetLastError().
+ *
+ * Note: Multiple instances of the CUDA Runtime library can be present in an
+ * application when using a library that statically links the CUDA Runtime.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMissingConfiguration,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorUnmapBufferObjectFailed,
+ * ::cudaErrorInvalidDevicePointer,
+ * ::cudaErrorInvalidTexture,
+ * ::cudaErrorInvalidTextureBinding,
+ * ::cudaErrorInvalidChannelDescriptor,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorInvalidFilterSetting,
+ * ::cudaErrorInvalidNormSetting,
+ * ::cudaErrorUnknown,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorInsufficientDriver,
+ * ::cudaErrorNoDevice,
+ * ::cudaErrorSetOnActiveProcess,
+ * ::cudaErrorStartupFailure,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetLastError, ::cudaGetErrorName, ::cudaGetErrorString, ::cudaError
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
+
+/**
+ * \brief Returns the string representation of an error code enum name
+ *
+ * Returns a string containing the name of an error code in the enum.  If the error
+ * code is not recognized, "unrecognized error code" is returned.
+ *
+ * \param error - Error code to convert to string
+ *
+ * \return
+ * \p char* pointer to a NULL-terminated string
+ *
+ * \sa ::cudaGetErrorString, ::cudaGetLastError, ::cudaPeekAtLastError, ::cudaError,
+ * ::cuGetErrorName
+ */
+extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
+
+/**
+ * \brief Returns the description string for an error code
+ *
+ * Returns the description string for an error code.  If the error
+ * code is not recognized, "unrecognized error code" is returned.
+ *
+ * \param error - Error code to convert to string
+ *
+ * \return
+ * \p char* pointer to a NULL-terminated string
+ *
+ * \sa ::cudaGetErrorName, ::cudaGetLastError, ::cudaPeekAtLastError, ::cudaError,
+ * ::cuGetErrorString
+ */
+extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
+/** @} */ /* END CUDART_ERROR */
+
+/**
+ * \addtogroup CUDART_DEVICE 
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the number of compute-capable devices
+ *
+ * Returns in \p *count the number of devices with compute capability greater
+ * than or equal to 2.0 that are available for execution.
+ *
+ * \param count - Returns the number of devices with compute capability
+ * greater than or equal to 2.0
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDevice, ::cudaSetDevice, ::cudaGetDeviceProperties,
+ * ::cudaChooseDevice, 
+ * ::cudaInitDevice,
+ * ::cuDeviceGetCount
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
+
+/**
+ * \brief Returns information about the compute-device
+ *
+ * Returns in \p *prop the properties of device \p dev. The ::cudaDeviceProp
+ * structure is defined as:
+ * \code
+    struct cudaDeviceProp {
+        char name[256];
+        cudaUUID_t uuid;
+        size_t totalGlobalMem;
+        size_t sharedMemPerBlock;
+        int regsPerBlock;
+        int warpSize;
+        size_t memPitch;
+        int maxThreadsPerBlock;
+        int maxThreadsDim[3];
+        int maxGridSize[3];
+        int clockRate;
+        size_t totalConstMem;
+        int major;
+        int minor;
+        size_t textureAlignment;
+        size_t texturePitchAlignment;
+        int deviceOverlap;
+        int multiProcessorCount;
+        int kernelExecTimeoutEnabled;
+        int integrated;
+        int canMapHostMemory;
+        int computeMode;
+        int maxTexture1D;
+        int maxTexture1DMipmap;
+        int maxTexture1DLinear;
+        int maxTexture2D[2];
+        int maxTexture2DMipmap[2];
+        int maxTexture2DLinear[3];
+        int maxTexture2DGather[2];
+        int maxTexture3D[3];
+        int maxTexture3DAlt[3];
+        int maxTextureCubemap;
+        int maxTexture1DLayered[2];
+        int maxTexture2DLayered[3];
+        int maxTextureCubemapLayered[2];
+        int maxSurface1D;
+        int maxSurface2D[2];
+        int maxSurface3D[3];
+        int maxSurface1DLayered[2];
+        int maxSurface2DLayered[3];
+        int maxSurfaceCubemap;
+        int maxSurfaceCubemapLayered[2];
+        size_t surfaceAlignment;
+        int concurrentKernels;
+        int ECCEnabled;
+        int pciBusID;
+        int pciDeviceID;
+        int pciDomainID;
+        int tccDriver;
+        int asyncEngineCount;
+        int unifiedAddressing;
+        int memoryClockRate;
+        int memoryBusWidth;
+        int l2CacheSize;
+        int persistingL2CacheMaxSize;
+        int maxThreadsPerMultiProcessor;
+        int streamPrioritiesSupported;
+        int globalL1CacheSupported;
+        int localL1CacheSupported;
+        size_t sharedMemPerMultiprocessor;
+        int regsPerMultiprocessor;
+        int managedMemory;
+        int isMultiGpuBoard;
+        int multiGpuBoardGroupID;
+        int singleToDoublePrecisionPerfRatio;
+        int pageableMemoryAccess;
+        int concurrentManagedAccess;
+        int computePreemptionSupported;
+        int canUseHostPointerForRegisteredMem;
+        int cooperativeLaunch;
+        int cooperativeMultiDeviceLaunch;
+        int pageableMemoryAccessUsesHostPageTables;
+        int directManagedMemAccessFromHost;
+        int accessPolicyMaxWindowSize;
+    }
+ \endcode
+ * where:
+ * - \ref ::cudaDeviceProp::name "name[256]" is an ASCII string identifying
+ *   the device.
+ * - \ref ::cudaDeviceProp::uuid "uuid" is a 16-byte unique identifier.
+ * - \ref ::cudaDeviceProp::totalGlobalMem "totalGlobalMem" is the total
+ *   amount of global memory available on the device in bytes.
+ * - \ref ::cudaDeviceProp::sharedMemPerBlock "sharedMemPerBlock" is the
+ *   maximum amount of shared memory available to a thread block in bytes.
+ * - \ref ::cudaDeviceProp::regsPerBlock "regsPerBlock" is the maximum number
+ *   of 32-bit registers available to a thread block.
+ * - \ref ::cudaDeviceProp::warpSize "warpSize" is the warp size in threads.
+ * - \ref ::cudaDeviceProp::memPitch "memPitch" is the maximum pitch in
+ *   bytes allowed by the memory copy functions that involve memory regions
+ *   allocated through ::cudaMallocPitch().
+ * - \ref ::cudaDeviceProp::maxThreadsPerBlock "maxThreadsPerBlock" is the
+ *   maximum number of threads per block.
+ * - \ref ::cudaDeviceProp::maxThreadsDim "maxThreadsDim[3]" contains the
+ *   maximum size of each dimension of a block.
+ * - \ref ::cudaDeviceProp::maxGridSize "maxGridSize[3]" contains the
+ *   maximum size of each dimension of a grid.
+ * - \ref ::cudaDeviceProp::clockRate "clockRate" is the clock frequency in
+ *   kilohertz.
+ * - \ref ::cudaDeviceProp::totalConstMem "totalConstMem" is the total amount
+ *   of constant memory available on the device in bytes.
+ * - \ref ::cudaDeviceProp::major "major",
+ *   \ref ::cudaDeviceProp::minor "minor" are the major and minor revision
+ *   numbers defining the device's compute capability.
+ * - \ref ::cudaDeviceProp::textureAlignment "textureAlignment" is the
+ *   alignment requirement; texture base addresses that are aligned to
+ *   \ref ::cudaDeviceProp::textureAlignment "textureAlignment" bytes do not
+ *   need an offset applied to texture fetches.
+ * - \ref ::cudaDeviceProp::texturePitchAlignment "texturePitchAlignment" is the
+ *   pitch alignment requirement for 2D texture references that are bound to 
+ *   pitched memory.
+ * - \ref ::cudaDeviceProp::deviceOverlap "deviceOverlap" is 1 if the device
+ *   can concurrently copy memory between host and device while executing a
+ *   kernel, or 0 if not.  Deprecated; use asyncEngineCount instead.
+ * - \ref ::cudaDeviceProp::multiProcessorCount "multiProcessorCount" is the
+ *   number of multiprocessors on the device.
+ * - \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled"
+ *   is 1 if there is a run time limit for kernels executed on the device, or
+ *   0 if not.
+ * - \ref ::cudaDeviceProp::integrated "integrated" is 1 if the device is an
+ *   integrated (motherboard) GPU and 0 if it is a discrete (card) component.
+ * - \ref ::cudaDeviceProp::canMapHostMemory "canMapHostMemory" is 1 if the
+ *   device can map host memory into the CUDA address space for use with
+ *   ::cudaHostAlloc()/::cudaHostGetDevicePointer(), or 0 if not.
+ * - \ref ::cudaDeviceProp::computeMode "computeMode" is the compute mode
+ *   that the device is currently in. Available modes are as follows:
+ *   - cudaComputeModeDefault: Default mode - Device is not restricted and
+ *     multiple threads can use ::cudaSetDevice() with this device.
+ *   - cudaComputeModeProhibited: Compute-prohibited mode - No threads can use
+ *     ::cudaSetDevice() with this device.
+ *   - cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many 
+ *     threads in one process will be able to use ::cudaSetDevice() with this device.
+ *   <br> When an occupied exclusive mode device is chosen with ::cudaSetDevice,
+ *   all subsequent non-device management runtime functions will return
+ *   ::cudaErrorDevicesUnavailable.
+ * - \ref ::cudaDeviceProp::maxTexture1D "maxTexture1D" is the maximum 1D
+ *   texture size.
+ * - \ref ::cudaDeviceProp::maxTexture1DMipmap "maxTexture1DMipmap" is the maximum
+ *   1D mipmapped texture size.
+ * - \ref ::cudaDeviceProp::maxTexture1DLinear "maxTexture1DLinear" is the maximum
+ *   1D texture size for textures bound to linear memory.
+ * - \ref ::cudaDeviceProp::maxTexture2D "maxTexture2D[2]" contains the maximum
+ *   2D texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTexture2DMipmap "maxTexture2DMipmap[2]" contains the
+ *   maximum 2D mipmapped texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTexture2DLinear "maxTexture2DLinear[3]" contains the 
+ *   maximum 2D texture dimensions for 2D textures bound to pitch linear memory.
+ * - \ref ::cudaDeviceProp::maxTexture2DGather "maxTexture2DGather[2]" contains the 
+ *   maximum 2D texture dimensions if texture gather operations have to be performed.
+ * - \ref ::cudaDeviceProp::maxTexture3D "maxTexture3D[3]" contains the maximum
+ *   3D texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTexture3DAlt "maxTexture3DAlt[3]"
+ *   contains the maximum alternate 3D texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTextureCubemap "maxTextureCubemap" is the 
+ *   maximum cubemap texture width or height.
+ * - \ref ::cudaDeviceProp::maxTexture1DLayered "maxTexture1DLayered[2]" contains
+ *   the maximum 1D layered texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTexture2DLayered "maxTexture2DLayered[3]" contains
+ *   the maximum 2D layered texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTextureCubemapLayered "maxTextureCubemapLayered[2]"
+ *   contains the maximum cubemap layered texture dimensions.
+ * - \ref ::cudaDeviceProp::maxSurface1D "maxSurface1D" is the maximum 1D
+ *   surface size.
+ * - \ref ::cudaDeviceProp::maxSurface2D "maxSurface2D[2]" contains the maximum
+ *   2D surface dimensions.
+ * - \ref ::cudaDeviceProp::maxSurface3D "maxSurface3D[3]" contains the maximum
+ *   3D surface dimensions.
+ * - \ref ::cudaDeviceProp::maxSurface1DLayered "maxSurface1DLayered[2]" contains
+ *   the maximum 1D layered surface dimensions.
+ * - \ref ::cudaDeviceProp::maxSurface2DLayered "maxSurface2DLayered[3]" contains
+ *   the maximum 2D layered surface dimensions.
+ * - \ref ::cudaDeviceProp::maxSurfaceCubemap "maxSurfaceCubemap" is the maximum 
+ *   cubemap surface width or height.
+ * - \ref ::cudaDeviceProp::maxSurfaceCubemapLayered "maxSurfaceCubemapLayered[2]"
+ *   contains the maximum cubemap layered surface dimensions.
+ * - \ref ::cudaDeviceProp::surfaceAlignment "surfaceAlignment" specifies the
+ *   alignment requirements for surfaces.
+ * - \ref ::cudaDeviceProp::concurrentKernels "concurrentKernels" is 1 if the
+ *   device supports executing multiple kernels within the same context
+ *   simultaneously, or 0 if not. It is not guaranteed that multiple kernels
+ *   will be resident on the device concurrently so this feature should not be
+ *   relied upon for correctness.
+ * - \ref ::cudaDeviceProp::ECCEnabled "ECCEnabled" is 1 if the device has ECC
+ *   support turned on, or 0 if not.
+ * - \ref ::cudaDeviceProp::pciBusID "pciBusID" is the PCI bus identifier of
+ *   the device.
+ * - \ref ::cudaDeviceProp::pciDeviceID "pciDeviceID" is the PCI device
+ *   (sometimes called slot) identifier of the device.
+ * - \ref ::cudaDeviceProp::pciDomainID "pciDomainID" is the PCI domain identifier
+ *   of the device.
+ * - \ref ::cudaDeviceProp::tccDriver "tccDriver" is 1 if the device is using a
+ *   TCC driver or 0 if not.
+ * - \ref ::cudaDeviceProp::asyncEngineCount "asyncEngineCount" is 1 when the
+ *   device can concurrently copy memory between host and device while executing
+ *   a kernel. It is 2 when the device can concurrently copy memory between host
+ *   and device in both directions and execute a kernel at the same time. It is
+ *   0 if neither of these is supported.
+ * - \ref ::cudaDeviceProp::unifiedAddressing "unifiedAddressing" is 1 if the device 
+ *   shares a unified address space with the host and 0 otherwise.
+ * - \ref ::cudaDeviceProp::memoryClockRate "memoryClockRate" is the peak memory 
+ *   clock frequency in kilohertz.
+ * - \ref ::cudaDeviceProp::memoryBusWidth "memoryBusWidth" is the memory bus width  
+ *   in bits.
+ * - \ref ::cudaDeviceProp::l2CacheSize "l2CacheSize" is L2 cache size in bytes. 
+ * - \ref ::cudaDeviceProp::persistingL2CacheMaxSize "persistingL2CacheMaxSize" is L2 cache's maximum persisting lines size in bytes.
+ * - \ref ::cudaDeviceProp::maxThreadsPerMultiProcessor "maxThreadsPerMultiProcessor"  
+ *   is the number of maximum resident threads per multiprocessor.
+ * - \ref ::cudaDeviceProp::streamPrioritiesSupported "streamPrioritiesSupported"
+ *   is 1 if the device supports stream priorities, or 0 if it is not supported.
+ * - \ref ::cudaDeviceProp::globalL1CacheSupported "globalL1CacheSupported"
+ *   is 1 if the device supports caching of globals in L1 cache, or 0 if it is not supported.
+ * - \ref ::cudaDeviceProp::localL1CacheSupported "localL1CacheSupported"
+ *   is 1 if the device supports caching of locals in L1 cache, or 0 if it is not supported.
+ * - \ref ::cudaDeviceProp::sharedMemPerMultiprocessor "sharedMemPerMultiprocessor" is the
+ *   maximum amount of shared memory available to a multiprocessor in bytes; this amount is
+ *   shared by all thread blocks simultaneously resident on a multiprocessor.
+ * - \ref ::cudaDeviceProp::regsPerMultiprocessor "regsPerMultiprocessor" is the maximum number
+ *   of 32-bit registers available to a multiprocessor; this number is shared
+ *   by all thread blocks simultaneously resident on a multiprocessor.
+ * - \ref ::cudaDeviceProp::managedMemory "managedMemory"
+ *   is 1 if the device supports allocating managed memory on this system, or 0 if it is not supported.
+ * - \ref ::cudaDeviceProp::isMultiGpuBoard "isMultiGpuBoard"
+ *   is 1 if the device is on a multi-GPU board (e.g. Gemini cards), and 0 if not.
+ * - \ref ::cudaDeviceProp::multiGpuBoardGroupID "multiGpuBoardGroupID" is a unique identifier
+ *   for a group of devices associated with the same board.
+ *   Devices on the same multi-GPU board will share the same identifier.
+ * - \ref ::cudaDeviceProp::hostNativeAtomicSupported "hostNativeAtomicSupported"
+ *   is 1 if the link between the device and the host supports native atomic operations, or 0 if it is not supported.
+ * - \ref ::cudaDeviceProp::singleToDoublePrecisionPerfRatio "singleToDoublePrecisionPerfRatio"  
+ *   is the ratio of single precision performance (in floating-point operations per second)
+ *   to double precision performance.
+ * - \ref ::cudaDeviceProp::pageableMemoryAccess "pageableMemoryAccess" is 1 if the device supports
+ *   coherently accessing pageable memory without calling cudaHostRegister on it, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::concurrentManagedAccess "concurrentManagedAccess" is 1 if the device can
+ *   coherently access managed memory concurrently with the CPU, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::computePreemptionSupported "computePreemptionSupported" is 1 if the device
+ *   supports Compute Preemption, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::canUseHostPointerForRegisteredMem "canUseHostPointerForRegisteredMem" is 1 if
+ *   the device can access host registered memory at the same virtual address as the CPU, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::cooperativeLaunch "cooperativeLaunch" is 1 if the device supports launching
+ *   cooperative kernels via ::cudaLaunchCooperativeKernel, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::cooperativeMultiDeviceLaunch "cooperativeMultiDeviceLaunch" is 1 if the device
+ *   supports launching cooperative kernels via ::cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::sharedMemPerBlockOptin "sharedMemPerBlockOptin"
+ *   is the per-device maximum shared memory per block usable by special opt-in.
+ * - \ref ::cudaDeviceProp::pageableMemoryAccessUsesHostPageTables "pageableMemoryAccessUsesHostPageTables" is 1 if the device accesses
+ *   pageable memory via the host's page tables, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::directManagedMemAccessFromHost "directManagedMemAccessFromHost" is 1 if the host can directly access managed
+ *   memory on the device without migration, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::maxBlocksPerMultiProcessor "maxBlocksPerMultiProcessor" is the maximum number of thread blocks
+ *   that can reside on a multiprocessor.
+ * - \ref ::cudaDeviceProp::accessPolicyMaxWindowSize "accessPolicyMaxWindowSize" is
+ *   the maximum value of ::cudaAccessPolicyWindow::num_bytes.
+ * - \ref ::cudaDeviceProp::reservedSharedMemPerBlock "reservedSharedMemPerBlock"
+ *   is the shared memory reserved by CUDA driver per block in bytes
+ * - \ref ::cudaDeviceProp::hostRegisterSupported "hostRegisterSupported"
+ *  is 1 if the device supports host memory registration via ::cudaHostRegister, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::sparseCudaArraySupported "sparseCudaArraySupported"
+ *  is 1 if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays, 0 otherwise
+ * - \ref ::cudaDeviceProp::hostRegisterReadOnlySupported "hostRegisterReadOnlySupported"
+ *  is 1 if the device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly to register memory that must be mapped as
+ *  read-only to the GPU
+ * - \ref ::cudaDeviceProp::timelineSemaphoreInteropSupported "timelineSemaphoreInteropSupported"
+ *  is 1 if external timeline semaphore interop is supported on the device, 0 otherwise
+ * - \ref ::cudaDeviceProp::memoryPoolsSupported "memoryPoolsSupported"
+ *  is 1 if the device supports using the cudaMallocAsync and cudaMemPool family of APIs, 0 otherwise
+ * - \ref ::cudaDeviceProp::gpuDirectRDMASupported "gpuDirectRDMASupported"
+ *  is 1 if the device supports GPUDirect RDMA APIs, 0 otherwise
+ * - \ref ::cudaDeviceProp::gpuDirectRDMAFlushWritesOptions "gpuDirectRDMAFlushWritesOptions"
+ *  is a bitmask to be interpreted according to the ::cudaFlushGPUDirectRDMAWritesOptions enum
+ * - \ref ::cudaDeviceProp::gpuDirectRDMAWritesOrdering "gpuDirectRDMAWritesOrdering"
+ *  See the ::cudaGPUDirectRDMAWritesOrdering enum for numerical values
+ * - \ref ::cudaDeviceProp::memoryPoolSupportedHandleTypes "memoryPoolSupportedHandleTypes"
+ *  is a bitmask of handle types supported with mempool-based IPC
+ * - \ref ::cudaDeviceProp::deferredMappingCudaArraySupported "deferredMappingCudaArraySupported"
+ *  is 1 if the device supports deferred mapping CUDA arrays and CUDA mipmapped arrays
+ * - \ref ::cudaDeviceProp::ipcEventSupported "ipcEventSupported"
+ *  is 1 if the device supports IPC Events, and 0 otherwise
+ * - \ref ::cudaDeviceProp::unifiedFunctionPointers "unifiedFunctionPointers"
+ *  is 1 if the device supports unified function pointers, and 0 otherwise
+ *
+ * \param prop   - Properties for the specified device
+ * \param device - Device number to get properties for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice,
+ * ::cudaDeviceGetAttribute, 
+ * ::cudaInitDevice,
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetName
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device);
+
+/**
+ * \brief Returns information about the device
+ *
+ * Returns in \p *value the integer value of the attribute \p attr on device
+ * \p device. The supported attributes are:
+ * - ::cudaDevAttrMaxThreadsPerBlock: Maximum number of threads per block
+ * - ::cudaDevAttrMaxBlockDimX: Maximum x-dimension of a block
+ * - ::cudaDevAttrMaxBlockDimY: Maximum y-dimension of a block
+ * - ::cudaDevAttrMaxBlockDimZ: Maximum z-dimension of a block
+ * - ::cudaDevAttrMaxGridDimX: Maximum x-dimension of a grid
+ * - ::cudaDevAttrMaxGridDimY: Maximum y-dimension of a grid
+ * - ::cudaDevAttrMaxGridDimZ: Maximum z-dimension of a grid
+ * - ::cudaDevAttrMaxSharedMemoryPerBlock: Maximum amount of shared memory
+ *   available to a thread block in bytes
+ * - ::cudaDevAttrTotalConstantMemory: Memory available on device for
+ *   __constant__ variables in a CUDA C kernel in bytes
+ * - ::cudaDevAttrWarpSize: Warp size in threads
+ * - ::cudaDevAttrMaxPitch: Maximum pitch in bytes allowed by the memory copy
+ *   functions that involve memory regions allocated through ::cudaMallocPitch()
+ * - ::cudaDevAttrMaxTexture1DWidth: Maximum 1D texture width
+ * - ::cudaDevAttrMaxTexture1DLinearWidth: Maximum width for a 1D texture bound
+ *   to linear memory
+ * - ::cudaDevAttrMaxTexture1DMipmappedWidth: Maximum mipmapped 1D texture width
+ * - ::cudaDevAttrMaxTexture2DWidth: Maximum 2D texture width
+ * - ::cudaDevAttrMaxTexture2DHeight: Maximum 2D texture height
+ * - ::cudaDevAttrMaxTexture2DLinearWidth: Maximum width for a 2D texture
+ *   bound to linear memory
+ * - ::cudaDevAttrMaxTexture2DLinearHeight: Maximum height for a 2D texture
+ *   bound to linear memory
+ * - ::cudaDevAttrMaxTexture2DLinearPitch: Maximum pitch in bytes for a 2D
+ *   texture bound to linear memory
+ * - ::cudaDevAttrMaxTexture2DMipmappedWidth: Maximum mipmapped 2D texture
+ *   width
+ * - ::cudaDevAttrMaxTexture2DMipmappedHeight: Maximum mipmapped 2D texture
+ *   height
+ * - ::cudaDevAttrMaxTexture3DWidth: Maximum 3D texture width
+ * - ::cudaDevAttrMaxTexture3DHeight: Maximum 3D texture height
+ * - ::cudaDevAttrMaxTexture3DDepth: Maximum 3D texture depth
+ * - ::cudaDevAttrMaxTexture3DWidthAlt: Alternate maximum 3D texture width,
+ *   0 if no alternate maximum 3D texture size is supported
+ * - ::cudaDevAttrMaxTexture3DHeightAlt: Alternate maximum 3D texture height,
+ *   0 if no alternate maximum 3D texture size is supported
+ * - ::cudaDevAttrMaxTexture3DDepthAlt: Alternate maximum 3D texture depth,
+ *   0 if no alternate maximum 3D texture size is supported
+ * - ::cudaDevAttrMaxTextureCubemapWidth: Maximum cubemap texture width or
+ *   height
+ * - ::cudaDevAttrMaxTexture1DLayeredWidth: Maximum 1D layered texture width
+ * - ::cudaDevAttrMaxTexture1DLayeredLayers: Maximum layers in a 1D layered
+ *   texture
+ * - ::cudaDevAttrMaxTexture2DLayeredWidth: Maximum 2D layered texture width
+ * - ::cudaDevAttrMaxTexture2DLayeredHeight: Maximum 2D layered texture height
+ * - ::cudaDevAttrMaxTexture2DLayeredLayers: Maximum layers in a 2D layered
+ *   texture
+ * - ::cudaDevAttrMaxTextureCubemapLayeredWidth: Maximum cubemap layered
+ *   texture width or height
+ * - ::cudaDevAttrMaxTextureCubemapLayeredLayers: Maximum layers in a cubemap
+ *   layered texture
+ * - ::cudaDevAttrMaxSurface1DWidth: Maximum 1D surface width
+ * - ::cudaDevAttrMaxSurface2DWidth: Maximum 2D surface width
+ * - ::cudaDevAttrMaxSurface2DHeight: Maximum 2D surface height
+ * - ::cudaDevAttrMaxSurface3DWidth: Maximum 3D surface width
+ * - ::cudaDevAttrMaxSurface3DHeight: Maximum 3D surface height
+ * - ::cudaDevAttrMaxSurface3DDepth: Maximum 3D surface depth
+ * - ::cudaDevAttrMaxSurface1DLayeredWidth: Maximum 1D layered surface width
+ * - ::cudaDevAttrMaxSurface1DLayeredLayers: Maximum layers in a 1D layered
+ *   surface
+ * - ::cudaDevAttrMaxSurface2DLayeredWidth: Maximum 2D layered surface width
+ * - ::cudaDevAttrMaxSurface2DLayeredHeight: Maximum 2D layered surface height
+ * - ::cudaDevAttrMaxSurface2DLayeredLayers: Maximum layers in a 2D layered
+ *   surface
+ * - ::cudaDevAttrMaxSurfaceCubemapWidth: Maximum cubemap surface width
+ * - ::cudaDevAttrMaxSurfaceCubemapLayeredWidth: Maximum cubemap layered
+ *   surface width
+ * - ::cudaDevAttrMaxSurfaceCubemapLayeredLayers: Maximum layers in a cubemap
+ *   layered surface
+ * - ::cudaDevAttrMaxRegistersPerBlock: Maximum number of 32-bit registers 
+ *   available to a thread block
+ * - ::cudaDevAttrClockRate: Peak clock frequency in kilohertz
+ * - ::cudaDevAttrTextureAlignment: Alignment requirement; texture base
+ *   addresses aligned to ::textureAlign bytes do not need an offset applied
+ *   to texture fetches
+ * - ::cudaDevAttrTexturePitchAlignment: Pitch alignment requirement for 2D
+ *   texture references bound to pitched memory
+ * - ::cudaDevAttrGpuOverlap: 1 if the device can concurrently copy memory
+ *   between host and device while executing a kernel, or 0 if not
+ * - ::cudaDevAttrMultiProcessorCount: Number of multiprocessors on the device
+ * - ::cudaDevAttrKernelExecTimeout: 1 if there is a run time limit for kernels
+ *   executed on the device, or 0 if not
+ * - ::cudaDevAttrIntegrated: 1 if the device is integrated with the memory
+ *   subsystem, or 0 if not
+ * - ::cudaDevAttrCanMapHostMemory: 1 if the device can map host memory into
+ *   the CUDA address space, or 0 if not
+ * - ::cudaDevAttrComputeMode: Compute mode is the compute mode that the device
+ *   is currently in. Available modes are as follows:
+ *   - ::cudaComputeModeDefault: Default mode - Device is not restricted and
+ *     multiple threads can use ::cudaSetDevice() with this device.
+ *   - ::cudaComputeModeProhibited: Compute-prohibited mode - No threads can use
+ *     ::cudaSetDevice() with this device.
+ *   - ::cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many 
+ *     threads in one process will be able to use ::cudaSetDevice() with this
+ *     device.
+ * - ::cudaDevAttrConcurrentKernels: 1 if the device supports executing
+ *   multiple kernels within the same context simultaneously, or 0 if
+ *   not. It is not guaranteed that multiple kernels will be resident on the
+ *   device concurrently so this feature should not be relied upon for
+ *   correctness.
+ * - ::cudaDevAttrEccEnabled: 1 if error correction is enabled on the device,
+ *   0 if error correction is disabled or not supported by the device
+ * - ::cudaDevAttrPciBusId: PCI bus identifier of the device
+ * - ::cudaDevAttrPciDeviceId: PCI device (also known as slot) identifier of
+ *   the device
+ * - ::cudaDevAttrTccDriver: 1 if the device is using a TCC driver. TCC is only
+ *   available on Tesla hardware running Windows Vista or later.
+ * - ::cudaDevAttrMemoryClockRate: Peak memory clock frequency in kilohertz
+ * - ::cudaDevAttrGlobalMemoryBusWidth: Global memory bus width in bits
+ * - ::cudaDevAttrL2CacheSize: Size of L2 cache in bytes. 0 if the device
+ *   doesn't have L2 cache.
+ * - ::cudaDevAttrMaxThreadsPerMultiProcessor: Maximum resident threads per 
+ *   multiprocessor
+ * - ::cudaDevAttrUnifiedAddressing: 1 if the device shares a unified address
+ *   space with the host, or 0 if not
+ * - ::cudaDevAttrComputeCapabilityMajor: Major compute capability version
+ *   number
+ * - ::cudaDevAttrComputeCapabilityMinor: Minor compute capability version
+ *   number
+ * - ::cudaDevAttrStreamPrioritiesSupported: 1 if the device supports stream
+ *   priorities, or 0 if not
+ * - ::cudaDevAttrGlobalL1CacheSupported: 1 if device supports caching globals 
+ *    in L1 cache, 0 if not
+ * - ::cudaDevAttrLocalL1CacheSupported: 1 if device supports caching locals 
+ *    in L1 cache, 0 if not
+ * - ::cudaDevAttrMaxSharedMemoryPerMultiprocessor: Maximum amount of shared memory
+ *   available to a multiprocessor in bytes; this amount is shared by all 
+ *   thread blocks simultaneously resident on a multiprocessor
+ * - ::cudaDevAttrMaxRegistersPerMultiprocessor: Maximum number of 32-bit registers 
+ *   available to a multiprocessor; this number is shared by all thread blocks
+ *   simultaneously resident on a multiprocessor
+ * - ::cudaDevAttrManagedMemory: 1 if device supports allocating
+ *   managed memory, 0 if not
+ * - ::cudaDevAttrIsMultiGpuBoard: 1 if device is on a multi-GPU board, 0 if not
+ * - ::cudaDevAttrMultiGpuBoardGroupID: Unique identifier for a group of devices on the
+ *   same multi-GPU board
+ * - ::cudaDevAttrHostNativeAtomicSupported: 1 if the link between the device and the
+ *   host supports native atomic operations
+ * - ::cudaDevAttrSingleToDoublePrecisionPerfRatio: Ratio of single precision performance
+ *   (in floating-point operations per second) to double precision performance
+ * - ::cudaDevAttrPageableMemoryAccess: 1 if the device supports coherently accessing
+ *   pageable memory without calling cudaHostRegister on it, and 0 otherwise
+ * - ::cudaDevAttrConcurrentManagedAccess: 1 if the device can coherently access managed
+ *   memory concurrently with the CPU, and 0 otherwise
+ * - ::cudaDevAttrComputePreemptionSupported: 1 if the device supports
+ *   Compute Preemption, 0 if not
+ * - ::cudaDevAttrCanUseHostPointerForRegisteredMem: 1 if the device can access host
+ *   registered memory at the same virtual address as the CPU, and 0 otherwise
+ * - ::cudaDevAttrCooperativeLaunch: 1 if the device supports launching cooperative kernels
+ *   via ::cudaLaunchCooperativeKernel, and 0 otherwise
+ * - ::cudaDevAttrCooperativeMultiDeviceLaunch: 1 if the device supports launching cooperative
+ *   kernels via ::cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise
+ * - ::cudaDevAttrCanFlushRemoteWrites: 1 if the device supports flushing of outstanding 
+ *   remote writes, and 0 otherwise
+ * - ::cudaDevAttrHostRegisterSupported: 1 if the device supports host memory registration
+ *   via ::cudaHostRegister, and 0 otherwise
+ * - ::cudaDevAttrPageableMemoryAccessUsesHostPageTables: 1 if the device accesses pageable memory via the
+ *   host's page tables, and 0 otherwise
+ * - ::cudaDevAttrDirectManagedMemAccessFromHost: 1 if the host can directly access managed memory on the device
+ *   without migration, and 0 otherwise
+ * - ::cudaDevAttrMaxSharedMemoryPerBlockOptin: Maximum per block shared memory size on the device. This value can
+ *   be opted into when using ::cudaFuncSetAttribute
+ * - ::cudaDevAttrMaxBlocksPerMultiprocessor: Maximum number of thread blocks that can reside on a multiprocessor
+ * - ::cudaDevAttrMaxPersistingL2CacheSize: Maximum L2 persisting lines capacity setting in bytes
+ * - ::cudaDevAttrMaxAccessPolicyWindowSize: Maximum value of cudaAccessPolicyWindow::num_bytes
+ * - ::cudaDevAttrReservedSharedMemoryPerBlock: Shared memory reserved by CUDA driver per block in bytes
+ * - ::cudaDevAttrSparseCudaArraySupported: 1 if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays.
+ * - ::cudaDevAttrHostRegisterReadOnlySupported: Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly
+ *   to register memory that must be mapped as read-only to the GPU
+ * - ::cudaDevAttrMemoryPoolsSupported: 1 if the device supports using the cudaMallocAsync and cudaMemPool family of APIs, and 0 otherwise
+ * - ::cudaDevAttrGPUDirectRDMASupported: 1 if the device supports GPUDirect RDMA APIs, and 0 otherwise
+ * - ::cudaDevAttrGPUDirectRDMAFlushWritesOptions: bitmask to be interpreted according to the ::cudaFlushGPUDirectRDMAWritesOptions enum 
+ * - ::cudaDevAttrGPUDirectRDMAWritesOrdering: see the ::cudaGPUDirectRDMAWritesOrdering enum for numerical values
+ * - ::cudaDevAttrMemoryPoolSupportedHandleTypes: Bitmask of handle types supported with mempool based IPC
+ * - ::cudaDevAttrDeferredMappingCudaArraySupported: 1 if the device supports deferred mapping CUDA arrays and CUDA mipmapped arrays.
+ * - ::cudaDevAttrIpcEventSupport: 1 if the device supports IPC Events.
+ *
+ * \param value  - Returned device attribute value
+ * \param attr   - Device attribute to query
+ * \param device - Device number to query 
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice,
+ * ::cudaGetDeviceProperties, 
+ * ::cudaInitDevice,
+ * ::cuDeviceGetAttribute
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
+
+/**
+ * \brief Returns the default mempool of a device
+ *
+ * Returns in \p *memPool the default memory pool of \p device.
+ * The default mempool of a device contains device memory from that device.
+ *
+ * \param memPool - Returned default memory pool of \p device
+ * \param device  - Device whose default memory pool is queried
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cuDeviceGetDefaultMemPool, ::cudaMallocAsync, ::cudaMemPoolTrimTo, ::cudaMemPoolGetAttribute, ::cudaDeviceSetMemPool, ::cudaMemPoolSetAttribute, ::cudaMemPoolSetAccess
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetDefaultMemPool(cudaMemPool_t *memPool, int device);
+
+
+/**
+ * \brief Sets the current memory pool of a device
+ *
+ * The memory pool must be local to the specified device.
+ * Unless a mempool is specified in the ::cudaMallocAsync call,
+ * ::cudaMallocAsync allocates from the current mempool of the provided stream's device.
+ * By default, a device's current memory pool is its default memory pool.
+ *
+ * \note Use ::cudaMallocFromPoolAsync to specify asynchronous allocations from a device different
+ * than the one the stream runs on.
+ *
+ * \param device  - Device whose current memory pool is being set
+ * \param memPool - Memory pool to make current; must be local to \p device
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ * \note_callback
+ *
+ * \sa ::cuDeviceSetMemPool, ::cudaDeviceGetMemPool, ::cudaDeviceGetDefaultMemPool, ::cudaMemPoolCreate, ::cudaMemPoolDestroy, ::cudaMallocFromPoolAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetMemPool(int device, cudaMemPool_t memPool);
+
+/**
+ * \brief Gets the current mempool for a device
+ *
+ * Returns the last pool provided to ::cudaDeviceSetMemPool for this device
+ * or the device's default memory pool if ::cudaDeviceSetMemPool has never been called.
+ * By default the current mempool is the default mempool for a device,
+ * otherwise the returned pool must have been set with ::cuDeviceSetMemPool or ::cudaDeviceSetMemPool.
+ *
+ * \param memPool - Returned current memory pool of \p device
+ * \param device  - Device whose current memory pool is queried
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cuDeviceGetMemPool, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceSetMemPool
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetMemPool(cudaMemPool_t *memPool, int device);
+
+/**
+ * \brief Return NvSciSync attributes that this device can support.
+ *
+ * Returns in \p nvSciSyncAttrList the properties of NvSciSync that
+ * this CUDA device, \p device, can support. The returned \p nvSciSyncAttrList
+ * can be used to create an NvSciSync that matches this device's capabilities.
+ *
+ * If the NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is
+ * already set, this API will return ::cudaErrorInvalidValue.
+ *
+ * The application should set \p nvSciSyncAttrList to a valid
+ * NvSciSyncAttrList; otherwise this API will return
+ * ::cudaErrorInvalidHandle.
+ *
+ * The \p flags argument controls how the application intends to use
+ * the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are:
+ * - ::cudaNvSciSyncAttrSignal, specifies that the application intends to
+ * signal an NvSciSync on this CUDA device.
+ * - ::cudaNvSciSyncAttrWait, specifies that the application intends to
+ * wait on an NvSciSync on this CUDA device.
+ *
+ * At least one of these flags must be set; otherwise the API
+ * returns ::cudaErrorInvalidValue. The two flags are orthogonal
+ * to one another: a developer may set both flags, which sets both
+ * wait- and signal-specific attributes in the same \p nvSciSyncAttrList.
+ *
+ * Note that this API updates the input \p nvSciSyncAttrList with values equivalent
+ * to the following public attribute key-values:
+ * NvSciSyncAttrKey_RequiredPerm is set to
+ * - NvSciSyncAccessPerm_SignalOnly if ::cudaNvSciSyncAttrSignal is set in \p flags.
+ * - NvSciSyncAccessPerm_WaitOnly if ::cudaNvSciSyncAttrWait is set in \p flags.
+ * - NvSciSyncAccessPerm_WaitSignal if both ::cudaNvSciSyncAttrWait and
+ * ::cudaNvSciSyncAttrSignal are set in \p flags.
+ * NvSciSyncAttrKey_PrimitiveInfo is set to
+ * - NvSciSyncAttrValPrimitiveType_SysmemSemaphore on any valid \p device.
+ * - NvSciSyncAttrValPrimitiveType_Syncpoint if \p device is a Tegra device.
+ * - NvSciSyncAttrValPrimitiveType_SysmemSemaphorePayload64b if \p device is GA10X+.
+ * NvSciSyncAttrKey_GpuId is set to the same UUID that is returned in
+ * \p cudaDeviceProp.uuid from ::cudaGetDeviceProperties for this \p device.
+ *
+ * \param nvSciSyncAttrList     - Return NvSciSync attributes supported.
+ * \param device                - Valid Cuda Device to get NvSciSync attributes for.
+ * \param flags                 - flags describing NvSciSync usage.
+ *
+ * \return
+ *
+ * ::cudaSuccess,
+ * ::cudaErrorDeviceUninitialized,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidHandle,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorMemoryAllocation
+ *
+ * \sa
+ * ::cudaImportExternalSemaphore,
+ * ::cudaDestroyExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, int device, int flags);
+
+/**
+ * \brief Queries attributes of the link between two devices.
+ *
+ * Returns in \p *value the value of the requested attribute \p attr of the
+ * link between \p srcDevice and \p dstDevice. The supported attributes are:
+ * - ::cudaDevP2PAttrPerformanceRank: A relative value indicating the
+ *   performance of the link between two devices. Lower value means better
+ *   performance (0 being the value used for most performant link).
+ * - ::cudaDevP2PAttrAccessSupported: 1 if peer access is enabled.
+ * - ::cudaDevP2PAttrNativeAtomicSupported: 1 if native atomic operations over
+ *   the link are supported.
+ * - ::cudaDevP2PAttrCudaArrayAccessSupported: 1 if accessing CUDA arrays over
+ *   the link is supported.
+ *
+ * Returns ::cudaErrorInvalidDevice if \p srcDevice or \p dstDevice are not valid
+ * or if they represent the same device.
+ *
+ * Returns ::cudaErrorInvalidValue if \p attr is not valid or if \p value is
+ * a null pointer.
+ *
+ * \param value         - Returned value of the requested attribute
+ * \param attr          - The requested attribute of the link between \p srcDevice and \p dstDevice.
+ * \param srcDevice     - The source device of the target link.
+ * \param dstDevice     - The destination device of the target link.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceEnablePeerAccess,
+ * ::cudaDeviceDisablePeerAccess,
+ * ::cudaDeviceCanAccessPeer,
+ * ::cuDeviceGetP2PAttribute
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice);
+
+/**
+ * \brief Select compute-device which best matches criteria
+ *
+ * Returns in \p *device the device which has properties that best match
+ * \p *prop.
+ *
+ * \param device - Device with best match
+ * \param prop   - Desired device properties
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice,
+ * ::cudaGetDeviceProperties, 
+ * ::cudaInitDevice
+ */
+extern __host__ cudaError_t CUDARTAPI cudaChooseDevice(int *device, const struct cudaDeviceProp *prop);
+/**
+ * \brief Initialize device to be used for GPU executions
+ *
+ * This function will initialize the CUDA Runtime structures and primary context on \p device when called,
+ * but the context will not be made current to \p device.
+ *
+ * When ::cudaInitDeviceFlagsAreValid is set in \p flags, \p deviceFlags are applied to the requested device.
+ * The values of \p deviceFlags match those of the flags parameter in ::cudaSetDeviceFlags.
+ * The effect may be verified by ::cudaGetDeviceFlags.
+ *
+ * This function will return an error if the device is in ::cudaComputeModeExclusiveProcess
+ * and is occupied by another process or if the device is in ::cudaComputeModeProhibited.
+ *
+ * \param device - Device on which the runtime will initialize itself.
+ * \param deviceFlags - Parameters for device operation.
+ * \param flags - Flags for controlling the device initialization.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties,
+ * ::cudaChooseDevice, ::cudaSetDevice,
+ * ::cuCtxSetCurrent
+ */
+extern __host__ cudaError_t CUDARTAPI cudaInitDevice(int device, unsigned int deviceFlags, unsigned int flags);
+/**
+ * \brief Set device to be used for GPU executions
+ *
+ * Sets \p device as the current device for the calling host thread.
+ * Valid device IDs are 0 to (::cudaGetDeviceCount() - 1).
+ *
+ * Any device memory subsequently allocated from this host thread
+ * using ::cudaMalloc(), ::cudaMallocPitch() or ::cudaMallocArray()
+ * will be physically resident on \p device.  Any host memory allocated
+ * from this host thread using ::cudaMallocHost() or ::cudaHostAlloc() 
+ * or ::cudaHostRegister() will have its lifetime associated  with
+ * \p device.  Any streams or events created from this host thread will 
+ * be associated with \p device.  Any kernels launched from this host
+ * thread using the <<<>>> operator or ::cudaLaunchKernel() will be executed
+ * on \p device.
+ *
+ * This call may be made from any host thread, to any device, and at 
+ * any time.  This function will do no synchronization with the previous 
+ * or new device, 
+ * and should only take significant time when it initializes the runtime's context state.
+ * This call will bind the primary context of the specified device to the calling thread and all the
+ * subsequent memory allocations, stream and event creations, and kernel launches
+ * will be associated with the primary context. 
+ * This function will also immediately initialize the runtime state on the primary context, 
+ * and the context will be current on \p device immediately. This function will return an 
+ * error if the device is in ::cudaComputeModeExclusiveProcess and is occupied by another 
+ * process or if the device is in ::cudaComputeModeProhibited.
+ * 
+ * It is not required to call ::cudaInitDevice before using this function.
+ * \param device - Device on which the active host thread should execute the
+ * device code.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorDeviceUnavailable
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties,
+ * ::cudaChooseDevice,
+ * ::cudaInitDevice,
+ * ::cuCtxSetCurrent
+ */
+extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device);
+
+/**
+ * \brief Returns which device is currently being used
+ *
+ * Returns in \p *device the current device for the calling host thread.
+ *
+ * \param device - Returns the device on which the active host thread
+ * executes the device code.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorDeviceUnavailable
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaSetDevice, ::cudaGetDeviceProperties,
+ * ::cudaChooseDevice,
+ * ::cuCtxGetCurrent
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
+
+/**
+ * \brief Set a list of devices that can be used for CUDA
+ *
+ * Sets a list of devices for CUDA execution in priority order using
+ * \p device_arr. The parameter \p len specifies the number of elements in the
+ * list.  CUDA will try devices from the list sequentially until it finds one
+ * that works.  If this function is not called, or if it is called with a \p len
+ * of 0, then CUDA will go back to its default behavior of trying devices
+ * sequentially from a default list containing all of the available CUDA
+ * devices in the system. If a specified device ID in the list does not exist,
+ * this function will return ::cudaErrorInvalidDevice. If \p len is not 0 and
+ * \p device_arr is NULL or if \p len exceeds the number of devices in
+ * the system, then ::cudaErrorInvalidValue is returned.
+ *
+ * \param device_arr - List of devices to try
+ * \param len        - Number of devices in specified list
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaSetDevice, ::cudaGetDeviceProperties,
+ * ::cudaSetDeviceFlags,
+ * ::cudaChooseDevice
+ */
+extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr, int len);
+
+/**
+ * \brief Sets flags to be used for device executions
+ * 
+ * Records \p flags as the flags for the current device. If the current device
+ * has been set and that device has already been initialized, the previous flags
+ * are overwritten. If the current device has not been initialized, it is
+ * initialized with the provided flags. If no device has been made current to
+ * the calling thread, a default device is selected and initialized with the
+ * provided flags.
+ * 
+ * The two LSBs of the \p flags parameter can be used to control how the CPU
+ * thread interacts with the OS scheduler when waiting for results from the
+ * device.
+ *
+ * - ::cudaDeviceScheduleAuto: The default value if the \p flags parameter is
+ * zero, uses a heuristic based on the number of active CUDA contexts in the
+ * process \p C and the number of logical processors in the system \p P. If
+ * \p C \> \p P, then CUDA will yield to other OS threads when waiting for the
+ * device, otherwise CUDA will not yield while waiting for results and
+ * actively spin on the processor. Additionally, on Tegra devices,
+ * ::cudaDeviceScheduleAuto uses a heuristic based on the power profile of
+ * the platform and may choose ::cudaDeviceScheduleBlockingSync for low-powered
+ * devices.
+ * - ::cudaDeviceScheduleSpin: Instruct CUDA to actively spin when waiting for
+ * results from the device. This can decrease latency when waiting for the
+ * device, but may lower the performance of CPU threads if they are performing
+ * work in parallel with the CUDA thread.
+ * - ::cudaDeviceScheduleYield: Instruct CUDA to yield its thread when waiting
+ * for results from the device. This can increase latency when waiting for the
+ * device, but can increase the performance of CPU threads performing work in
+ * parallel with the device.
+ * - ::cudaDeviceScheduleBlockingSync: Instruct CUDA to block the CPU thread 
+ * on a synchronization primitive when waiting for the device to finish work.
+ * - ::cudaDeviceBlockingSync: Instruct CUDA to block the CPU thread on a 
+ * synchronization primitive when waiting for the device to finish work. <br>
+ * \ref deprecated "Deprecated:" This flag was deprecated as of CUDA 4.0 and
+ * replaced with ::cudaDeviceScheduleBlockingSync.
+ * - ::cudaDeviceMapHost: This flag enables allocating pinned
+ * host memory that is accessible to the device. It is implicit for the
+ * runtime but may be absent if a context is created using the driver API.
+ * If this flag is not set, ::cudaHostGetDevicePointer() will always return
+ * a failure code.
+ * - ::cudaDeviceLmemResizeToMax: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * \ref deprecated "Deprecated:" This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ * - ::cudaDeviceSyncMemops: Ensures that synchronous memory operations initiated
+ * on this context will always synchronize. See further documentation in the
+ * section titled "API Synchronization behavior" to learn more about cases when
+ * synchronous memory operations can exhibit asynchronous behavior.
+ *
+ * \param flags - Parameters for device operation
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceFlags, ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties,
+ * ::cudaSetDevice, ::cudaSetValidDevices,
+ * ::cudaInitDevice,
+ * ::cudaChooseDevice,
+ * ::cuDevicePrimaryCtxSetFlags
+ */
+extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags( unsigned int flags );
+
+/**
+ * \brief Gets the flags for the current device
+ *
+ * 
+ * Returns in \p flags the flags for the current device. If there is a current
+ * device for the calling thread, the flags for the device are returned. If
+ * there is no current device, the flags for the first device are returned,
+ * which may be the default flags.  Compare to the behavior of
+ * ::cudaSetDeviceFlags.
+ *
+ * Typically, the flags returned should match the behavior that will be seen
+ * if the calling thread uses a device after this call, without any change to
+ * the flags or current device in between by this or another thread.  Note that
+ * if the device is not initialized, it is possible for another thread to
+ * change the flags for the current device before it is initialized.
+ * Additionally, when using exclusive mode, if this thread has not requested a
+ * specific device, it may use a device other than the first device, contrary
+ * to the assumption made by this function.
+ *
+ * If a context has been created via the driver API and is current to the
+ * calling thread, the flags for that context are always returned.
+ *
+ * Flags returned by this function may specifically include ::cudaDeviceMapHost
+ * even though it is not accepted by ::cudaSetDeviceFlags because it is
+ * implicit in runtime API flags.  The reason for this is that the current
+ * context may have been created via the driver API in which case the flag is
+ * not implicit and may be unset.
+ *
+ * \param flags - Pointer to store the device flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDevice, ::cudaGetDeviceProperties,
+ * ::cudaSetDevice, ::cudaSetDeviceFlags,
+ * ::cudaInitDevice,
+ * ::cuCtxGetFlags,
+ * ::cuDevicePrimaryCtxGetState
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags( unsigned int *flags );
+/** @} */ /* END CUDART_DEVICE */
+
+/**
+ * \defgroup CUDART_STREAM Stream Management
+ *
+ * ___MANBRIEF___ stream management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the stream management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Create an asynchronous stream
+ *
+ * Creates a new asynchronous stream.
+ *
+ * \param pStream - Pointer to new stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreateWithPriority,
+ * ::cudaStreamCreateWithFlags,
+ * ::cudaStreamGetPriority,
+ * ::cudaStreamGetFlags,
+ * ::cudaStreamQuery,
+ * ::cudaStreamSynchronize,
+ * ::cudaStreamWaitEvent,
+ * ::cudaStreamAddCallback,
+ * ::cudaStreamDestroy,
+ * ::cuStreamCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream);
+
+/**
+ * \brief Create an asynchronous stream
+ *
+ * Creates a new asynchronous stream.  The \p flags argument determines the 
+ * behaviors of the stream.  Valid values for \p flags are
+ * - ::cudaStreamDefault: Default stream creation flag.
+ * - ::cudaStreamNonBlocking: Specifies that work running in the created 
+ *   stream may run concurrently with work in stream 0 (the NULL stream), and that
+ *   the created stream should perform no implicit synchronization with stream 0.
+ *
+ * \param pStream - Pointer to new stream identifier
+ * \param flags   - Parameters for stream creation
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate,
+ * ::cudaStreamCreateWithPriority,
+ * ::cudaStreamGetFlags,
+ * ::cudaStreamQuery,
+ * ::cudaStreamSynchronize,
+ * ::cudaStreamWaitEvent,
+ * ::cudaStreamAddCallback,
+ * ::cudaStreamDestroy,
+ * ::cuStreamCreate
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
+
+/**
+ * \brief Create an asynchronous stream with the specified priority
+ *
+ * Creates a stream with the specified priority and returns a handle in \p pStream.
+ * This affects the scheduling priority of work in the stream. Priorities provide a
+ * hint to preferentially run work with higher priority when possible, but do
+ * not preempt already-running work or provide any other functional guarantee on
+ * execution order.
+ *
+ * \p priority follows a convention where lower numbers represent higher priorities.
+ * '0' represents default priority. The range of meaningful numerical priorities can
+ * be queried using ::cudaDeviceGetStreamPriorityRange. If the specified priority is
+ * outside the numerical range returned by ::cudaDeviceGetStreamPriorityRange,
+ * it will automatically be clamped to the lowest or the highest number in the range.
+ *
+ * \param pStream  - Pointer to new stream identifier
+ * \param flags    - Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed
+ * \param priority - Priority of the stream. Lower numbers represent higher priorities.
+ *                   See ::cudaDeviceGetStreamPriorityRange for more information about
+ *                   the meaningful stream priorities that can be passed.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \note Stream priorities are supported only on GPUs
+ * with compute capability 3.5 or higher.
+ *
+ * \note In the current implementation, only compute kernels launched in
+ * priority streams are affected by the stream's priority. Stream priorities have
+ * no effect on host-to-device and device-to-host memory operations.
+ *
+ * \sa ::cudaStreamCreate,
+ * ::cudaStreamCreateWithFlags,
+ * ::cudaDeviceGetStreamPriorityRange,
+ * ::cudaStreamGetPriority,
+ * ::cudaStreamQuery,
+ * ::cudaStreamWaitEvent,
+ * ::cudaStreamAddCallback,
+ * ::cudaStreamSynchronize,
+ * ::cudaStreamDestroy,
+ * ::cuStreamCreateWithPriority
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, int priority);
+
+/**
+ * \brief Query the priority of a stream
+ *
+ * Query the priority of a stream. The priority is returned in \p priority.
+ * Note that if the stream was created with a priority outside the meaningful
+ * numerical range returned by ::cudaDeviceGetStreamPriorityRange,
+ * this function returns the clamped priority.
+ * See ::cudaStreamCreateWithPriority for details about priority clamping.
+ *
+ * \param hStream    - Handle to the stream to be queried
+ * \param priority   - Pointer to a signed integer in which the stream's priority is returned
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreateWithPriority,
+ * ::cudaDeviceGetStreamPriorityRange,
+ * ::cudaStreamGetFlags,
+ * ::cuStreamGetPriority
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority);
+
+/**
+ * \brief Query the flags of a stream
+ *
+ * Query the flags of a stream. The flags are returned in \p flags.
+ * See ::cudaStreamCreateWithFlags for a list of valid flags.
+ *
+ * \param hStream - Handle to the stream to be queried
+ * \param flags   - Pointer to an unsigned integer in which the stream's flags are returned
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreateWithPriority,
+ * ::cudaStreamCreateWithFlags,
+ * ::cudaStreamGetPriority,
+ * ::cuStreamGetFlags
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags);
+
+/**
+ * \brief Query the Id of a stream
+ *
+ * Query the Id of a stream. The Id is returned in \p streamId.
+ * The Id is unique for the life of the program.
+ *
+ * The stream handle \p hStream can refer to any of the following:
+ * <ul>
+ *   <li>a stream created via any of the CUDA runtime APIs such as ::cudaStreamCreate, 
+ *   ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority, or their driver 
+ *   API equivalents such as ::cuStreamCreate or ::cuStreamCreateWithPriority.
+ *   Passing an invalid handle will result in undefined behavior.</li>
+ *   <li>any of the special streams such as the NULL stream, ::cudaStreamLegacy 
+ *   and ::cudaStreamPerThread respectively.  The driver API equivalents of these 
+ *   are also accepted which are NULL, ::CU_STREAM_LEGACY and ::CU_STREAM_PER_THREAD.</li>
+ * </ul>
+ * 
+ * \param hStream    - Handle to the stream to be queried
+ * \param streamId   - Pointer to an unsigned long long in which the stream Id is returned
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreateWithPriority,
+ * ::cudaStreamCreateWithFlags,
+ * ::cudaStreamGetPriority,
+ * ::cudaStreamGetFlags,
+ * ::cuStreamGetId
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetId(cudaStream_t hStream, unsigned long long *streamId);
+
+/**
+ * \brief Resets all persisting lines in cache to normal status.
+ *
+ * Resets all persisting lines in cache to normal status.
+ * Takes effect on function return.
+ *
+ * \return
+ * ::cudaSuccess,
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ cudaError_t CUDARTAPI cudaCtxResetPersistingL2Cache(void);
+
+/**
+ * \brief Copies attributes from source stream to destination stream.
+ *
+ * Copies attributes from source stream \p src to destination stream \p dst.
+ * Both streams must have the same context.
+ *
+ * \param[out] dst Destination stream
+ * \param[in] src Source stream
+ * For attributes see ::cudaStreamAttrID
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src);
+
+ /**
+ * \brief Queries stream attribute.
+ *
+ * Queries attribute \p attr from \p hStream and stores it in corresponding
+ * member of \p value_out.
+ *
+ * \param[in] hStream
+ * \param[in] attr
+ * \param[out] value_out
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetAttribute(
+        cudaStream_t hStream, cudaStreamAttrID attr,
+        cudaStreamAttrValue *value_out);
+
+ /**
+ * \brief Sets stream attribute.
+ *
+ * Sets attribute \p attr on \p hStream from corresponding attribute of
+ * \p value. The updated attribute will be applied to subsequent work
+ * submitted to the stream. It will not affect previously submitted work.
+ *
+ * \param[out] hStream
+ * \param[in] attr
+ * \param[in] value
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamSetAttribute(
+        cudaStream_t hStream, cudaStreamAttrID attr,
+        const cudaStreamAttrValue *value);
+
+ /**
+ * \brief Destroys and cleans up an asynchronous stream
+ *
+ * Destroys and cleans up the asynchronous stream specified by \p stream.
+ *
+ * In case the device is still doing work in the stream \p stream
+ * when ::cudaStreamDestroy() is called, the function will return immediately 
+ * and the resources associated with \p stream will be released automatically 
+ * once the device has completed all work in \p stream.
+ *
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa ::cudaStreamCreate,
+ * ::cudaStreamCreateWithFlags,
+ * ::cudaStreamQuery,
+ * ::cudaStreamWaitEvent,
+ * ::cudaStreamSynchronize,
+ * ::cudaStreamAddCallback,
+ * ::cuStreamDestroy
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
+
+/**
+ * \brief Make a compute stream wait on an event
+ *
+ * Makes all future work submitted to \p stream wait for all work captured in
+ * \p event.  See ::cudaEventRecord() for details on what is captured by an event.
+ * The synchronization will be performed efficiently on the device when applicable.
+ * \p event may be from a different device than \p stream.
+ *
+ * flags include:
+ * - ::cudaEventWaitDefault: Default event wait flag.
+ * - ::cudaEventWaitExternal: Event is captured in the graph as an external
+ *   event node when performing stream capture.
+ *
+ * \param stream - Stream to wait
+ * \param event  - Event to wait on
+ * \param flags  - Parameters for the operation (see above)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy,
+ * ::cuStreamWaitEvent
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags __dv(0));
+
+/**
+ * Type of stream callback functions.
+ * \param stream The stream as passed to ::cudaStreamAddCallback, may be NULL.
+ * \param status ::cudaSuccess or any persistent error on the stream.
+ * \param userData User parameter provided at registration.
+ */
+typedef void (CUDART_CB *cudaStreamCallback_t)(cudaStream_t stream, cudaError_t status, void *userData);
+
+/**
+ * \brief Add a callback to a compute stream
+ *
+ * \note This function is slated for eventual deprecation and removal. If
+ * you do not require the callback to execute in case of a device error,
+ * consider using ::cudaLaunchHostFunc. Additionally, this function is not
+ * supported with ::cudaStreamBeginCapture and ::cudaStreamEndCapture, unlike
+ * ::cudaLaunchHostFunc.
+ *
+ * Adds a callback to be called on the host after all currently enqueued
+ * items in the stream have completed.  For each 
+ * cudaStreamAddCallback call, a callback will be executed exactly once.
+ * The callback will block later work in the stream until it is finished.
+ *
+ * The callback may be passed ::cudaSuccess or an error code.  In the event
+ * of a device error, all subsequently executed callbacks will receive an
+ * appropriate ::cudaError_t.
+ *
+ * Callbacks must not make any CUDA API calls.  Attempting to use CUDA APIs
+ * may result in ::cudaErrorNotPermitted.  Callbacks must not perform any
+ * synchronization that may depend on outstanding device work or other callbacks
+ * that are not mandated to run earlier.  Callbacks without a mandated order
+ * (in independent streams) execute in undefined order and may be serialized.
+ *
+ * For the purposes of Unified Memory, callback execution makes a number of
+ * guarantees:
+ * <ul>
+ *   <li>The callback stream is considered idle for the duration of the
+ *   callback.  Thus, for example, a callback may always use memory attached
+ *   to the callback stream.</li>
+ *   <li>The start of execution of a callback has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the callback.  It thus synchronizes streams which have been "joined"
+ *   prior to the callback.</li>
+ *   <li>Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding callbacks have executed.  Thus, for
+ *   example, a callback might use global attached memory even if work has
+ *   been added to another stream, if it has been properly ordered with an
+ *   event.</li>
+ *   <li>Completion of a callback does not cause a stream to become
+ *   active except as described above.  The callback stream will remain idle
+ *   if no device work follows the callback, and will remain idle across
+ *   consecutive callbacks without device work in between.  Thus, for example,
+ *   stream synchronization can be done by signaling from a callback at the
+ *   end of the stream.</li>
+ * </ul>
+ *
+ * \param stream   - Stream to add callback to
+ * \param callback - The function to call once preceding stream operations are complete
+ * \param userData - User specified data to be passed to the callback function
+ * \param flags    - Reserved for future use, must be 0
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamSynchronize, ::cudaStreamWaitEvent, ::cudaStreamDestroy, ::cudaMallocManaged, ::cudaStreamAttachMemAsync,
+ * ::cudaLaunchHostFunc, ::cuStreamAddCallback
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream,
+        cudaStreamCallback_t callback, void *userData, unsigned int flags);
+
+/**
+ * \brief Waits for stream tasks to complete
+ *
+ * Blocks until \p stream has completed all operations. If the
+ * ::cudaDeviceScheduleBlockingSync flag was set for this device, 
+ * the host thread will block until the stream is finished with 
+ * all of its tasks.
+ *
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamWaitEvent, ::cudaStreamAddCallback, ::cudaStreamDestroy,
+ * ::cuStreamSynchronize
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream);
+
+/**
+ * \brief Queries an asynchronous stream for completion status
+ *
+ * Returns ::cudaSuccess if all operations in \p stream have
+ * completed, or ::cudaErrorNotReady if not.
+ *
+ * For the purposes of Unified Memory, a return value of ::cudaSuccess
+ * is equivalent to having called ::cudaStreamSynchronize().
+ *
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy,
+ * ::cuStreamQuery
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream);
+
+/**
+ * \brief Attach memory to a stream asynchronously
+ *
+ * Enqueues an operation in \p stream to specify stream association of
+ * \p length bytes of memory starting from \p devPtr. This function is a
+ * stream-ordered operation, meaning that it is dependent on, and will
+ * only take effect when, previous work in stream has completed. Any
+ * previous association is automatically replaced.
+ *
+ * \p devPtr must point to one of the following types of memory:
+ * - managed memory declared using the __managed__ keyword or allocated with
+ *   ::cudaMallocManaged.
+ * - a valid host-accessible region of system-allocated pageable memory. This
+ *   type of memory may only be specified if the device associated with the
+ *   stream reports a non-zero value for the device attribute
+ *   ::cudaDevAttrPageableMemoryAccess.
+ *
+ * For managed allocations, \p length must be either zero or the entire
+ * allocation's size. Both indicate that the entire allocation's stream
+ * association is being changed. Currently, it is not possible to change stream
+ * association for a portion of a managed allocation.
+ *
+ * For pageable allocations, \p length must be non-zero.
+ *
+ * The stream association is specified using \p flags which must be
+ * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle.
+ * The default value for \p flags is ::cudaMemAttachSingle.
+ * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed
+ * by any stream on any device.
+ * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee
+ * that it won't access the memory on the device from any stream on a device that
+ * has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * If the ::cudaMemAttachSingle flag is specified and \p stream is associated with
+ * a device that has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess,
+ * the program makes a guarantee that it will only access the memory on the device
+ * from \p stream. It is illegal to attach singly to the NULL stream, because the
+ * NULL stream is a virtual global stream and not a specific stream. An error will
+ * be returned in this case.
+ *
+ * When memory is associated with a single stream, the Unified Memory system will
+ * allow CPU access to this memory region so long as all operations in \p stream
+ * have completed, regardless of whether other streams are active. In effect,
+ * this constrains exclusive ownership of the managed memory region by
+ * an active GPU to per-stream activity instead of whole-GPU activity.
+ *
+ * Accessing memory on the device from streams that are not associated with
+ * it will produce undefined results. No error checking is performed by the
+ * Unified Memory system to ensure that kernels launched into other streams
+ * do not access this region. 
+ *
+ * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync
+ * via events, synchronization or other means to ensure legal access to memory
+ * at all times. Data visibility and coherency will be changed appropriately
+ * for all kernels which follow a stream-association change.
+ *
+ * If \p stream is destroyed while data is associated with it, the association is
+ * removed and the association reverts to the default visibility of the allocation
+ * as specified at ::cudaMallocManaged. For __managed__ variables, the default
+ * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an
+ * asynchronous operation, and as a result, the change to default association won't
+ * happen until all work in the stream has completed.
+ *
+ * \param stream  - Stream in which to enqueue the attach operation
+ * \param devPtr  - Pointer to memory (must be a pointer to managed memory or
+ *                  to a valid host-accessible region of system-allocated
+ *                  memory)
+ * \param length  - Length of memory (defaults to zero)
+ * \param flags   - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged,
+ * ::cuStreamAttachMemAsync
+ */
+#if defined(__cplusplus)
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags = cudaMemAttachSingle);
+#else
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags);
+#endif
+
+/**
+ * \brief Begins graph capture on a stream
+ *
+ * Begin graph capture on \p stream. When a stream is in capture mode, all operations
+ * pushed into the stream will not be executed, but will instead be captured into
+ * a graph, which will be returned via ::cudaStreamEndCapture. Capture may not be initiated
+ * if \p stream is ::cudaStreamLegacy. Capture must be ended on the same stream in which
+ * it was initiated, and it may only be initiated if the stream is not already in capture
+ * mode. The capture mode may be queried via ::cudaStreamIsCapturing. A unique id
+ * representing the capture sequence may be queried via ::cudaStreamGetCaptureInfo.
+ *
+ * If \p mode is not ::cudaStreamCaptureModeRelaxed, ::cudaStreamEndCapture must be
+ * called on this stream from the same thread.
+ *
+ * \note Kernels captured using this API must not use texture and surface references.
+ *       Reading or writing through any texture or surface reference is undefined
+ *       behavior. This restriction does not apply to texture and surface objects.
+ *
+ * \param stream - Stream in which to initiate capture
+ * \param mode    - Controls the interaction of this capture sequence with other API
+ *                  calls that are potentially unsafe. For more details see
+ *                  ::cudaThreadExchangeStreamCaptureMode.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamCreate,
+ * ::cudaStreamIsCapturing,
+ * ::cudaStreamEndCapture,
+ * ::cudaThreadExchangeStreamCaptureMode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode);
+
+/**
+ * \brief Begins graph capture on a stream to an existing graph
+ *
+ * Begin graph capture on \p stream. When a stream is in capture mode, all operations
+ * pushed into the stream will not be executed, but will instead be captured into
+ * \p graph, which will be returned via ::cudaStreamEndCapture.
+ *
+ * Capture may not be initiated if \p stream is ::cudaStreamLegacy. Capture must be ended on the
+ * same stream in which it was initiated, and it may only be initiated if the stream is not
+ * already in capture mode. The capture mode may be queried via ::cudaStreamIsCapturing. A unique id
+ * representing the capture sequence may be queried via ::cudaStreamGetCaptureInfo.
+ *
+ * If \p mode is not ::cudaStreamCaptureModeRelaxed, ::cudaStreamEndCapture must be
+ * called on this stream from the same thread.
+ *
+ * \note Kernels captured using this API must not use texture and surface references.
+ *       Reading or writing through any texture or surface reference is undefined
+ *       behavior. This restriction does not apply to texture and surface objects.
+ *
+ * \param stream          - Stream in which to initiate capture.
+ * \param graph           - Graph to capture into.
+ * \param dependencies    - Dependencies of the first node captured in the stream.  Can be NULL if numDependencies is 0.
+ * \param dependencyData  - Optional array of data associated with each dependency.
+ * \param numDependencies - Number of dependencies.
+ * \param mode            - Controls the interaction of this capture sequence with other API
+ *                          calls that are potentially unsafe. For more details see
+ *                          ::cudaThreadExchangeStreamCaptureMode.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamCreate,
+ * ::cudaStreamIsCapturing,
+ * ::cudaStreamEndCapture,
+ * ::cudaThreadExchangeStreamCaptureMode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamBeginCaptureToGraph(cudaStream_t stream, cudaGraph_t graph, const cudaGraphNode_t *dependencies, const cudaGraphEdgeData *dependencyData, size_t numDependencies, enum cudaStreamCaptureMode mode);
+
+/**
+ * \brief Swaps the stream capture interaction mode for a thread
+ *
+ * Sets the calling thread's stream capture interaction mode to the value contained
+ * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To
+ * facilitate deterministic behavior across function or module boundaries, callers
+ * are encouraged to use this API in a push-pop fashion: \code
+     cudaStreamCaptureMode mode = desiredMode;
+     cudaThreadExchangeStreamCaptureMode(&mode);
+     ...
+     cudaThreadExchangeStreamCaptureMode(&mode); // restore previous mode
+ * \endcode
+ *
+ * During stream capture (see ::cudaStreamBeginCapture), some actions, such as a call
+ * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is
+ * not enqueued asynchronously to a stream, and is not observed by stream capture.
+ * Therefore, if the sequence of operations captured via ::cudaStreamBeginCapture
+ * depended on the allocation being replayed whenever the graph is launched, the
+ * captured graph would be invalid.
+ *
+ * Therefore, stream capture places restrictions on API calls that can be made within
+ * or concurrently to a ::cudaStreamBeginCapture-::cudaStreamEndCapture sequence. This
+ * behavior can be controlled via this API and flags to ::cudaStreamBeginCapture.
+ *
+ * A thread's mode is one of the following:
+ * - \p cudaStreamCaptureModeGlobal: This is the default mode. If the local thread has
+ *   an ongoing capture sequence that was not initiated with
+ *   \p cudaStreamCaptureModeRelaxed at ::cudaStreamBeginCapture, or if any other thread
+ *   has a concurrent capture sequence initiated with \p cudaStreamCaptureModeGlobal,
+ *   this thread is prohibited from potentially unsafe API calls.
+ * - \p cudaStreamCaptureModeThreadLocal: If the local thread has an ongoing capture
+ *   sequence not initiated with \p cudaStreamCaptureModeRelaxed, it is prohibited
+ *   from potentially unsafe API calls. Concurrent capture sequences in other threads
+ *   are ignored.
+ * - \p cudaStreamCaptureModeRelaxed: The local thread is not prohibited from potentially
+ *   unsafe API calls. Note that the thread is still prohibited from API calls which
+ *   necessarily conflict with stream capture, for example, attempting ::cudaEventQuery
+ *   on an event that was last recorded inside a capture sequence.
+ *
+ * \param mode - Pointer to mode value to swap with the current mode
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamBeginCapture
+ */
+extern __host__ cudaError_t CUDARTAPI cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode *mode);
+
+/**
+ * \brief Ends capture on a stream, returning the captured graph
+ *
+ * End capture on \p stream, returning the captured graph via \p pGraph.
+ * Capture must have been initiated on \p stream via a call to ::cudaStreamBeginCapture.
+ * If capture was invalidated, due to a violation of the rules of stream capture, then
+ * a NULL graph will be returned.
+ *
+ * If the \p mode argument to ::cudaStreamBeginCapture was not
+ * ::cudaStreamCaptureModeRelaxed, this call must be from the same thread as
+ * ::cudaStreamBeginCapture.
+ *
+ * \param stream - Stream on which to end capture
+ * \param pGraph - The captured graph
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorStreamCaptureWrongThread
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamCreate,
+ * ::cudaStreamBeginCapture,
+ * ::cudaStreamIsCapturing,
+ * ::cudaGraphDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph);
+
+/**
+ * \brief Returns a stream's capture status
+ *
+ * Return the capture status of \p stream via \p pCaptureStatus. After a successful
+ * call, \p *pCaptureStatus will contain one of the following:
+ * - ::cudaStreamCaptureStatusNone: The stream is not capturing.
+ * - ::cudaStreamCaptureStatusActive: The stream is capturing.
+ * - ::cudaStreamCaptureStatusInvalidated: The stream was capturing but an error
+ *   has invalidated the capture sequence. The capture sequence must be terminated
+ *   with ::cudaStreamEndCapture on the stream where it was initiated in order to
+ *   continue using \p stream.
+ *
+ * Note that, if this is called on ::cudaStreamLegacy (the "null stream") while
+ * a blocking stream on the same device is capturing, it will return
+ * ::cudaErrorStreamCaptureImplicit and \p *pCaptureStatus is unspecified
+ * after the call. The blocking stream capture is not invalidated.
+ *
+ * When a blocking stream is capturing, the legacy stream is in an
+ * unusable state until the blocking stream capture is terminated. The legacy
+ * stream is not supported for stream capture, but attempted use would have an
+ * implicit dependency on the capturing stream(s).
+ *
+ * \param stream         - Stream to query
+ * \param pCaptureStatus - Returns the stream's capture status
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorStreamCaptureImplicit
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamCreate,
+ * ::cudaStreamBeginCapture,
+ * ::cudaStreamEndCapture
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus);
+
+/**
+ * \brief Query a stream's capture state
+ *
+ * Query stream state related to stream capture.
+ *
+ * If called on ::cudaStreamLegacy (the "null stream") while a stream not created 
+ * with ::cudaStreamNonBlocking is capturing, returns ::cudaErrorStreamCaptureImplicit.
+ *
+ * Valid data (other than capture status) is returned only if both of the following are true:
+ * - the call returns cudaSuccess
+ * - the returned capture status is ::cudaStreamCaptureStatusActive
+ *
+ * \param stream - The stream to query
+ * \param captureStatus_out - Location to return the capture status of the stream; required
+ * \param id_out - Optional location to return an id for the capture sequence, which is
+ *           unique over the lifetime of the process
+ * \param graph_out - Optional location to return the graph being captured into. All
+ *           operations other than destroy and node removal are permitted on the graph
+ *           while the capture sequence is in progress. This API does not transfer
+ *           ownership of the graph, which is transferred or destroyed at
+ *           ::cudaStreamEndCapture. Note that the graph handle may be invalidated before
+ *           end of capture for certain errors. Nodes that are or become
+ *           unreachable from the original stream at ::cudaStreamEndCapture due to direct
+ *           actions on the graph do not trigger ::cudaErrorStreamCaptureUnjoined.
+ * \param dependencies_out - Optional location to store a pointer to an array of nodes.
+ *           The next node to be captured in the stream will depend on this set of nodes,
+ *           absent operations such as event wait which modify this set. The array pointer
+ *           is valid until the next API call which operates on the stream or until the
+ *           capture is terminated. The node handles may be copied out and are valid until
+ *           they or the graph is destroyed. The driver-owned array may also be passed
+ *           directly to APIs that operate on the graph (not the stream) without copying.
+ * \param numDependencies_out - Optional location to store the size of the array
+ *           returned in dependencies_out.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorStreamCaptureImplicit
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamGetCaptureInfo_v3,
+ * ::cudaStreamBeginCapture,
+ * ::cudaStreamIsCapturing,
+ * ::cudaStreamUpdateCaptureDependencies
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo(cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out __dv(0), cudaGraph_t *graph_out __dv(0), const cudaGraphNode_t **dependencies_out __dv(0), size_t *numDependencies_out __dv(0));
+
+/**
+ * \brief Query a stream's capture state (12.3+)
+ *
+ * Query stream state related to stream capture.
+ *
+ * If called on ::cudaStreamLegacy (the "null stream") while a stream not created 
+ * with ::cudaStreamNonBlocking is capturing, returns ::cudaErrorStreamCaptureImplicit.
+ *
+ * Valid data (other than capture status) is returned only if both of the following are true:
+ * - the call returns cudaSuccess
+ * - the returned capture status is ::cudaStreamCaptureStatusActive
+ *
+ * If \p edgeData_out is non-NULL then \p dependencies_out must be as well. If
+ * \p dependencies_out is non-NULL and \p edgeData_out is NULL, but there is non-zero edge
+ * data for one or more of the current stream dependencies, the call will return
+ * ::cudaErrorLossyQuery.
+ *
+ * \param stream - The stream to query
+ * \param captureStatus_out - Location to return the capture status of the stream; required
+ * \param id_out - Optional location to return an id for the capture sequence, which is
+ *           unique over the lifetime of the process
+ * \param graph_out - Optional location to return the graph being captured into. All
+ *           operations other than destroy and node removal are permitted on the graph
+ *           while the capture sequence is in progress. This API does not transfer
+ *           ownership of the graph, which is transferred or destroyed at
+ *           ::cudaStreamEndCapture. Note that the graph handle may be invalidated before
+ *           end of capture for certain errors. Nodes that are or become
+ *           unreachable from the original stream at ::cudaStreamEndCapture due to direct
+ *           actions on the graph do not trigger ::cudaErrorStreamCaptureUnjoined.
+ * \param dependencies_out - Optional location to store a pointer to an array of nodes.
+ *           The next node to be captured in the stream will depend on this set of nodes,
+ *           absent operations such as event wait which modify this set. The array pointer
+ *           is valid until the next API call which operates on the stream or until the
+ *           capture is terminated. The node handles may be copied out and are valid until
+ *           they or the graph is destroyed. The driver-owned array may also be passed
+ *           directly to APIs that operate on the graph (not the stream) without copying.
+ * \param edgeData_out - Optional location to store a pointer to an array of graph edge
+ *           data. This array parallels \c dependencies_out; the next node to be added
+ *           has an edge to \c dependencies_out[i] with annotation \c edgeData_out[i] for
+ *           each \c i. The array pointer is valid until the next API call which operates
+ *           on the stream or until the capture is terminated.
+ * \param numDependencies_out - Optional location to store the size of the array
+ *           returned in dependencies_out.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorStreamCaptureImplicit,
+ * ::cudaErrorLossyQuery
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamBeginCapture,
+ * ::cudaStreamIsCapturing,
+ * ::cudaStreamUpdateCaptureDependencies
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo_v3(cudaStream_t stream,
+    enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out __dv(0),
+    cudaGraph_t *graph_out __dv(0), const cudaGraphNode_t **dependencies_out __dv(0),
+    const cudaGraphEdgeData **edgeData_out __dv(0), size_t *numDependencies_out __dv(0));
+
+/**
+ * \brief Update the set of dependencies in a capturing stream (11.3+)
+ *
+ * Modifies the dependency set of a capturing stream. The dependency set is the set
+ * of nodes that the next captured node in the stream will depend on.
+ *
+ * Valid flags are ::cudaStreamAddCaptureDependencies and
+ * ::cudaStreamSetCaptureDependencies. These control whether the set passed to
+ * the API is added to the existing set or replaces it. A flags value of 0 defaults
+ * to ::cudaStreamAddCaptureDependencies.
+ *
+ * Nodes that are removed from the dependency set via this API do not result in
+ * ::cudaErrorStreamCaptureUnjoined if they are unreachable from the stream at
+ * ::cudaStreamEndCapture.
+ *
+ * Returns ::cudaErrorIllegalState if the stream is not capturing.
+ *
+ * This API is new in CUDA 11.3. Developers requiring compatibility across minor
+ * versions of the CUDA driver back to 11.0 should either avoid this API or provide a fallback.
+ *
+ * \param stream - The stream to update
+ * \param dependencies - The set of dependencies to add
+ * \param numDependencies - The size of the dependencies array
+ * \param flags - See above
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorIllegalState
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamBeginCapture,
+ * ::cudaStreamGetCaptureInfo
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t *dependencies, size_t numDependencies, unsigned int flags __dv(0));
+
+/**
+ * \brief Update the set of dependencies in a capturing stream (12.3+)
+ *
+ * Modifies the dependency set of a capturing stream. The dependency set is the set
+ * of nodes that the next captured node in the stream will depend on.
+ *
+ * Valid flags are ::cudaStreamAddCaptureDependencies and
+ * ::cudaStreamSetCaptureDependencies. These control whether the set passed to
+ * the API is added to the existing set or replaces it. A flags value of 0 defaults
+ * to ::cudaStreamAddCaptureDependencies.
+ *
+ * Nodes that are removed from the dependency set via this API do not result in
+ * ::cudaErrorStreamCaptureUnjoined if they are unreachable from the stream at
+ * ::cudaStreamEndCapture.
+ *
+ * Returns ::cudaErrorIllegalState if the stream is not capturing.
+ *
+ * \param stream - The stream to update
+ * \param dependencies - The set of dependencies to add
+ * \param dependencyData - Optional array of data associated with each dependency.
+ * \param numDependencies - The size of the dependencies array
+ * \param flags - See above
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorIllegalState
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamBeginCapture,
+ * ::cudaStreamGetCaptureInfo
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamUpdateCaptureDependencies_v2(cudaStream_t stream, cudaGraphNode_t *dependencies, const cudaGraphEdgeData *dependencyData, size_t numDependencies, unsigned int flags __dv(0));
+/** @} */ /* END CUDART_STREAM */
+
+/**
+ * \defgroup CUDART_EVENT Event Management
+ *
+ * ___MANBRIEF___ event management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the event management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates an event object
+ *
+ * Creates an event object for the current device using ::cudaEventDefault.
+ *
+ * \param event - Newly created event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*, unsigned int) "cudaEventCreate (C++ API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cudaStreamWaitEvent,
+ * ::cuEventCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event);
+
+/**
+ * \brief Creates an event object with the specified flags
+ *
+ * Creates an event object for the current device with the specified flags. Valid
+ * flags include:
+ * - ::cudaEventDefault: Default event creation flag.
+ * - ::cudaEventBlockingSync: Specifies that event should use blocking
+ *   synchronization. A host thread that uses ::cudaEventSynchronize() to wait
+ *   on an event created with this flag will block until the event actually
+ *   completes.
+ * - ::cudaEventDisableTiming: Specifies that the created event does not need
+ *   to record timing data.  Events created with this flag specified and
+ *   the ::cudaEventBlockingSync flag not specified will provide the best
+ *   performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery().
+ * - ::cudaEventInterprocess: Specifies that the created event may be used as an
+ *   interprocess event by ::cudaIpcGetEventHandle(). ::cudaEventInterprocess must
+ *   be specified along with ::cudaEventDisableTiming.
+ *
+ * \param event - Newly created event
+ * \param flags - Flags for new event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cudaStreamWaitEvent,
+ * ::cuEventCreate
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
+
+/**
+ * \brief Records an event
+ *
+ * Captures in \p event the contents of \p stream at the time of this call.
+ * \p event and \p stream must be on the same CUDA context.
+ * Calls such as ::cudaEventQuery() or ::cudaStreamWaitEvent() will then
+ * examine or wait for completion of the work that was captured. Uses of
+ * \p stream after this call do not modify \p event. See note on default
+ * stream behavior for what is captured in the default case.
+ *
+ * ::cudaEventRecord() can be called multiple times on the same event and
+ * will overwrite the previously captured state. Other APIs such as
+ * ::cudaStreamWaitEvent() use the most recently captured state at the time
+ * of the API call, and are not affected by later calls to
+ * ::cudaEventRecord(). Before the first call to ::cudaEventRecord(), an
+ * event represents an empty set of work, so for example ::cudaEventQuery()
+ * would return ::cudaSuccess.
+ *
+ * \param event  - Event to record
+ * \param stream - Stream in which to record event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_null_event
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cudaStreamWaitEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cuEventRecord
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Records an event
+ *
+ * Captures in \p event the contents of \p stream at the time of this call.
+ * \p event and \p stream must be on the same CUDA context.
+ * Calls such as ::cudaEventQuery() or ::cudaStreamWaitEvent() will then
+ * examine or wait for completion of the work that was captured. Uses of
+ * \p stream after this call do not modify \p event. See note on default
+ * stream behavior for what is captured in the default case.
+ *
+ * ::cudaEventRecordWithFlags() can be called multiple times on the same event and
+ * will overwrite the previously captured state. Other APIs such as
+ * ::cudaStreamWaitEvent() use the most recently captured state at the time
+ * of the API call, and are not affected by later calls to
+ * ::cudaEventRecordWithFlags(). Before the first call to ::cudaEventRecordWithFlags(), an
+ * event represents an empty set of work, so for example ::cudaEventQuery()
+ * would return ::cudaSuccess.
+ *
+ * Valid flags include:
+ * - ::cudaEventRecordDefault: Default event record flag.
+ * - ::cudaEventRecordExternal: Event is captured in the graph as an external
+ *   event node when performing stream capture.
+ *
+ * \param event  - Event to record
+ * \param stream - Stream in which to record event
+ * \param flags  - Parameters for the operation (see above)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_null_event
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cudaStreamWaitEvent,
+ * ::cudaEventRecord,
+ * ::cuEventRecord
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream __dv(0), unsigned int flags __dv(0));
+#endif
+
+/**
+ * \brief Queries an event's status
+ *
+ * Queries the status of all work currently captured by \p event. See
+ * ::cudaEventRecord() for details on what is captured by an event.
+ *
+ * Returns ::cudaSuccess if all captured work has been completed, or
+ * ::cudaErrorNotReady if any captured work is incomplete.
+ *
+ * For the purposes of Unified Memory, a return value of ::cudaSuccess
+ * is equivalent to having called ::cudaEventSynchronize().
+ *
+ * \param event - Event to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_null_event
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventRecord,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cuEventQuery
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event);
+
+/**
+ * \brief Waits for an event to complete
+ *
+ * Waits until the completion of all work currently captured in \p event.
+ * See ::cudaEventRecord() for details on what is captured by an event.
+ *
+ * Waiting for an event that was created with the ::cudaEventBlockingSync
+ * flag will cause the calling CPU thread to block until the event has
+ * been completed by the device.  If the ::cudaEventBlockingSync flag has
+ * not been set, then the CPU thread will busy-wait until the event has
+ * been completed by the device.
+ *
+ * \param event - Event to wait for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_null_event
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventRecord,
+ * ::cudaEventQuery, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cuEventSynchronize
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event);
+
+/**
+ * \brief Destroys an event object
+ *
+ * Destroys the event specified by \p event.
+ *
+ * An event may be destroyed before it is complete (i.e., while
+ * ::cudaEventQuery() would return ::cudaErrorNotReady). In this case, the
+ * call does not block on completion of the event, and any associated
+ * resources will automatically be released asynchronously at completion.
+ *
+ * \param event - Event to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ * \note_null_event
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventRecord, ::cudaEventElapsedTime,
+ * ::cuEventDestroy
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
+
+/**
+ * \brief Computes the elapsed time between events
+ *
+ * Computes the elapsed time between two events (in milliseconds with a
+ * resolution of around 0.5 microseconds).
+ *
+ * If either event was last recorded in a non-NULL stream, the resulting time
+ * may be greater than expected (even if both used the same stream handle). This
+ * happens because the ::cudaEventRecord() operation takes place asynchronously
+ * and there is no guarantee that the measured latency is actually just between
+ * the two events. Any number of other different stream operations could execute
+ * in between the two measured events, thus altering the timing in a significant
+ * way.
+ *
+ * If ::cudaEventRecord() has not been called on either event, then
+ * ::cudaErrorInvalidResourceHandle is returned. If ::cudaEventRecord() has been
+ * called on both events but one or both of them has not yet been completed
+ * (that is, ::cudaEventQuery() would return ::cudaErrorNotReady on at least one
+ * of the events), ::cudaErrorNotReady is returned. If either event was created
+ * with the ::cudaEventDisableTiming flag, then this function will return
+ * ::cudaErrorInvalidResourceHandle.
+ *
+ * \param ms    - Time between \p start and \p end in ms
+ * \param start - Starting event
+ * \param end   - Ending event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_null_event
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventRecord,
+ * ::cuEventElapsedTime
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end);
+
+/** @} */ /* END CUDART_EVENT */
+
+/**
+ * \defgroup CUDART_EXTRES_INTEROP External Resource Interoperability
+ *
+ * ___MANBRIEF___ External resource interoperability functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the external resource interoperability functions of the CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Imports an external memory object
+ *
+ * Imports an externally allocated memory object and returns
+ * a handle to that in \p extMem_out.
+ *
+ * The properties of the handle being imported must be described in
+ * \p memHandleDesc. The ::cudaExternalMemoryHandleDesc structure
+ * is defined as follows:
+ *
+ * \code
+        typedef struct cudaExternalMemoryHandleDesc_st {
+            cudaExternalMemoryHandleType type;
+            union {
+                int fd;
+                struct {
+                    void *handle;
+                    const void *name;
+                } win32;
+                const void *nvSciBufObject;
+            } handle;
+            unsigned long long size;
+            unsigned int flags;
+        } cudaExternalMemoryHandleDesc;
+ * \endcode
+ *
+ * where ::cudaExternalMemoryHandleDesc::type specifies the type
+ * of handle being imported. ::cudaExternalMemoryHandleType is
+ * defined as:
+ *
+ * \code
+        typedef enum cudaExternalMemoryHandleType_enum {
+            cudaExternalMemoryHandleTypeOpaqueFd         = 1,
+            cudaExternalMemoryHandleTypeOpaqueWin32      = 2,
+            cudaExternalMemoryHandleTypeOpaqueWin32Kmt   = 3,
+            cudaExternalMemoryHandleTypeD3D12Heap        = 4,
+            cudaExternalMemoryHandleTypeD3D12Resource    = 5,
+	        cudaExternalMemoryHandleTypeD3D11Resource    = 6,
+		    cudaExternalMemoryHandleTypeD3D11ResourceKmt = 7,
+            cudaExternalMemoryHandleTypeNvSciBuf         = 8
+        } cudaExternalMemoryHandleType;
+ * \endcode
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeOpaqueFd, then
+ * ::cudaExternalMemoryHandleDesc::handle::fd must be a valid
+ * file descriptor referencing a memory object. Ownership of
+ * the file descriptor is transferred to the CUDA driver when the
+ * handle is imported successfully. Performing any operations on the
+ * file descriptor after it is imported results in undefined behavior.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeOpaqueWin32, then exactly one
+ * of ::cudaExternalMemoryHandleDesc::handle::win32::handle and
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalMemoryHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a memory object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a memory object.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeOpaqueWin32Kmt, then
+ * ::cudaExternalMemoryHandleDesc::handle::win32::handle must
+ * be non-NULL and
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * must be NULL. The handle specified must be a globally shared KMT
+ * handle. This handle does not hold a reference to the underlying
+ * object, and thus will be invalid when all references to the
+ * memory object are destroyed.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeD3D12Heap, then exactly one
+ * of ::cudaExternalMemoryHandleDesc::handle::win32::handle and
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalMemoryHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
+ * ID3D12Heap object. This handle holds a reference to the underlying
+ * object. If ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a ID3D12Heap object.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeD3D12Resource, then exactly one
+ * of ::cudaExternalMemoryHandleDesc::handle::win32::handle and
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalMemoryHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
+ * ID3D12Resource object. This handle holds a reference to the
+ * underlying object. If
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a ID3D12Resource object.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeD3D11Resource, then exactly one
+ * of ::cudaExternalMemoryHandleDesc::handle::win32::handle and
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalMemoryHandleDesc::handle::win32::handle is
+ * not NULL, then it must represent a valid shared NT handle that is
+ * returned by IDXGIResource1::CreateSharedHandle when referring to a
+ * ID3D11Resource object. If
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a ID3D11Resource object.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeD3D11ResourceKmt, then
+ * ::cudaExternalMemoryHandleDesc::handle::win32::handle must
+ * be non-NULL and ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * must be NULL. The handle specified must be a valid shared KMT
+ * handle that is returned by IDXGIResource::GetSharedHandle when
+ * referring to a ID3D11Resource object.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeNvSciBuf, then
+ * ::cudaExternalMemoryHandleDesc::handle::nvSciBufObject must be NON-NULL
+ * and reference a valid NvSciBuf object.
+ * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the
+ * application must use ::cudaWaitExternalSemaphoresAsync or ::cudaSignalExternalSemaphoresAsync
+ * as appropriate barriers to maintain coherence between CUDA and the other drivers.
+ * See ::cudaExternalSemaphoreWaitSkipNvSciBufMemSync and ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync 
+ * for memory synchronization.
+ *
+ * The size of the memory object must be specified in
+ * ::cudaExternalMemoryHandleDesc::size.
+ *
+ * Specifying the flag ::cudaExternalMemoryDedicated in
+ * ::cudaExternalMemoryHandleDesc::flags indicates that the
+ * resource is a dedicated resource. The definition of what constitutes
+ * a dedicated resource is outside the scope of this extension.
+ * This flag must be set if ::cudaExternalMemoryHandleDesc::type
+ * is one of the following:
+ * ::cudaExternalMemoryHandleTypeD3D12Resource
+ * ::cudaExternalMemoryHandleTypeD3D11Resource
+ * ::cudaExternalMemoryHandleTypeD3D11ResourceKmt
+ *
+ * \param extMem_out    - Returned handle to an external memory object
+ * \param memHandleDesc - Memory import handle descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorOperatingSystem
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the
+ * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges
+ * as well as appropriate Vulkan pipeline barriers to maintain coherence between
+ * CPU and GPU. For more information on these APIs, please refer to "Synchronization
+ * and Cache Control" chapter from Vulkan specification.
+ *
+ *
+ * \sa ::cudaDestroyExternalMemory,
+ * ::cudaExternalMemoryGetMappedBuffer,
+ * ::cudaExternalMemoryGetMappedMipmappedArray
+ */
+extern __host__ cudaError_t CUDARTAPI cudaImportExternalMemory(cudaExternalMemory_t *extMem_out, const struct cudaExternalMemoryHandleDesc *memHandleDesc);
+
+/**
+ * \brief Maps a buffer onto an imported memory object
+ *
+ * Maps a buffer onto an imported memory object and returns a device
+ * pointer in \p devPtr.
+ *
+ * The properties of the buffer being mapped must be described in
+ * \p bufferDesc. The ::cudaExternalMemoryBufferDesc structure is
+ * defined as follows:
+ *
+ * \code
+        typedef struct cudaExternalMemoryBufferDesc_st {
+            unsigned long long offset;
+            unsigned long long size;
+            unsigned int flags;
+        } cudaExternalMemoryBufferDesc;
+ * \endcode
+ *
+ * where ::cudaExternalMemoryBufferDesc::offset is the offset in
+ * the memory object where the buffer's base address is.
+ * ::cudaExternalMemoryBufferDesc::size is the size of the buffer.
+ * ::cudaExternalMemoryBufferDesc::flags must be zero.
+ *
+ * The offset and size have to be suitably aligned to match the
+ * requirements of the external API. Mapping two buffers whose ranges
+ * overlap may or may not result in the same virtual address being
+ * returned for the overlapped portion. In such cases, the application
+ * must ensure that all accesses to that region from the GPU are
+ * volatile. Otherwise writes made via one address are not guaranteed
+ * to be visible via the other address, even if they're issued by the
+ * same thread. It is recommended that applications map the combined
+ * range instead of mapping separate buffers and then apply the
+ * appropriate offsets to the returned pointer to derive the
+ * individual buffers.
+ *
+ * The returned pointer \p devPtr must be freed using ::cudaFree.
+ *
+ * \param devPtr     - Returned device pointer to buffer
+ * \param extMem     - Handle to external memory object
+ * \param bufferDesc - Buffer descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaImportExternalMemory,
+ * ::cudaDestroyExternalMemory,
+ * ::cudaExternalMemoryGetMappedMipmappedArray
+ */
+extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedBuffer(void **devPtr, cudaExternalMemory_t extMem, const struct cudaExternalMemoryBufferDesc *bufferDesc);
+
+/**
+ * \brief Maps a CUDA mipmapped array onto an external memory object
+ *
+ * Maps a CUDA mipmapped array onto an external object and returns a
+ * handle to it in \p mipmap.
+ *
+ * The properties of the CUDA mipmapped array being mapped must be
+ * described in \p mipmapDesc. The structure
+ * ::cudaExternalMemoryMipmappedArrayDesc is defined as follows:
+ *
+ * \code
+        typedef struct cudaExternalMemoryMipmappedArrayDesc_st {
+            unsigned long long offset;
+            cudaChannelFormatDesc formatDesc;
+            cudaExtent extent;
+            unsigned int flags;
+            unsigned int numLevels;
+        } cudaExternalMemoryMipmappedArrayDesc;
+ * \endcode
+ *
+ * where ::cudaExternalMemoryMipmappedArrayDesc::offset is the
+ * offset in the memory object where the base level of the mipmap
+ * chain is.
+ * ::cudaExternalMemoryMipmappedArrayDesc::formatDesc describes the
+ * format of the data.
+ * ::cudaExternalMemoryMipmappedArrayDesc::extent specifies the
+ * dimensions of the base level of the mipmap chain.
+ * ::cudaExternalMemoryMipmappedArrayDesc::flags are flags associated
+ * with CUDA mipmapped arrays. For further details, please refer to
+ * the documentation for ::cudaMalloc3DArray. Note that if the mipmapped
+ * array is bound as a color target in the graphics API, then the flag
+ * ::cudaArrayColorAttachment must be specified in 
+ * ::cudaExternalMemoryMipmappedArrayDesc::flags.
+ * ::cudaExternalMemoryMipmappedArrayDesc::numLevels specifies
+ * the total number of levels in the mipmap chain.
+ *
+ * The returned CUDA mipmapped array must be freed using ::cudaFreeMipmappedArray.
+ *
+ * \param mipmap     - Returned CUDA mipmapped array
+ * \param extMem     - Handle to external memory object
+ * \param mipmapDesc - CUDA array descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaImportExternalMemory,
+ * ::cudaDestroyExternalMemory,
+ * ::cudaExternalMemoryGetMappedBuffer
+ *
+ * \note If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeNvSciBuf, then
+ * ::cudaExternalMemoryMipmappedArrayDesc::numLevels must not be greater than 1.
+ */
+extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedMipmappedArray(cudaMipmappedArray_t *mipmap, cudaExternalMemory_t extMem, const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc);
+
+/**
+ * \brief Destroys an external memory object.
+ *
+ * Destroys the specified external memory object. Any existing buffers
+ * and CUDA mipmapped arrays mapped onto this object must no longer be
+ * used and must be explicitly freed using ::cudaFree and
+ * ::cudaFreeMipmappedArray respectively.
+ *
+ * \param extMem - External memory object to be destroyed
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa ::cudaImportExternalMemory,
+ * ::cudaExternalMemoryGetMappedBuffer,
+ * ::cudaExternalMemoryGetMappedMipmappedArray
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDestroyExternalMemory(cudaExternalMemory_t extMem);
+
+/**
+ * \brief Imports an external semaphore
+ *
+ * Imports an externally allocated synchronization object and returns
+ * a handle to that in \p extSem_out.
+ *
+ * The properties of the handle being imported must be described in
+ * \p semHandleDesc. The ::cudaExternalSemaphoreHandleDesc is defined
+ * as follows:
+ *
+ * \code
+        typedef struct cudaExternalSemaphoreHandleDesc_st {
+            cudaExternalSemaphoreHandleType type;
+            union {
+                int fd;
+                struct {
+                    void *handle;
+                    const void *name;
+                } win32;
+                const void* nvSciSyncObj;
+            } handle;
+            unsigned int flags;
+        } cudaExternalSemaphoreHandleDesc;
+ * \endcode
+ *
+ * where ::cudaExternalSemaphoreHandleDesc::type specifies the type of
+ * handle being imported. ::cudaExternalSemaphoreHandleType is defined
+ * as:
+ *
+ * \code
+        typedef enum cudaExternalSemaphoreHandleType_enum {
+            cudaExternalSemaphoreHandleTypeOpaqueFd                = 1,
+            cudaExternalSemaphoreHandleTypeOpaqueWin32             = 2,
+            cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt          = 3,
+            cudaExternalSemaphoreHandleTypeD3D12Fence              = 4,
+            cudaExternalSemaphoreHandleTypeD3D11Fence              = 5,
+            cudaExternalSemaphoreHandleTypeNvSciSync               = 6,
+            cudaExternalSemaphoreHandleTypeKeyedMutex              = 7,
+            cudaExternalSemaphoreHandleTypeKeyedMutexKmt           = 8,
+            cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd     = 9,
+            cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32  = 10
+        } cudaExternalSemaphoreHandleType;
+ * \endcode
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeOpaqueFd, then
+ * ::cudaExternalSemaphoreHandleDesc::handle::fd must be a valid file
+ * descriptor referencing a synchronization object. Ownership of the
+ * file descriptor is transferred to the CUDA driver when the handle
+ * is imported successfully. Performing any operations on the file
+ * descriptor after it is imported results in undefined behavior.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32, then exactly one of
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a synchronization object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::cudaExternalSemaphoreHandleDesc::handle::win32::name is
+ * not NULL, then it must name a valid synchronization object.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt, then
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle must be
+ * non-NULL and ::cudaExternalSemaphoreHandleDesc::handle::win32::name
+ * must be NULL. The handle specified must be a globally shared KMT
+ * handle. This handle does not hold a reference to the underlying
+ * object, and thus will be invalid when all references to the
+ * synchronization object are destroyed.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeD3D12Fence, then exactly one of
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
+ * ID3D12Fence object. This handle holds a reference to the underlying
+ * object. If ::cudaExternalSemaphoreHandleDesc::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid ID3D12Fence object.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeD3D11Fence, then exactly one of
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D11Fence::CreateSharedHandle. If 
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid ID3D11Fence object.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeNvSciSync, then
+ * ::cudaExternalSemaphoreHandleDesc::handle::nvSciSyncObj
+ * represents a valid NvSciSyncObj.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutex, then exactly one of
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by IDXGIResource1::CreateSharedHandle when referring to
+ * a IDXGIKeyedMutex object.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt, then
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle must be
+ * non-NULL and ::cudaExternalSemaphoreHandleDesc::handle::win32::name
+ * must be NULL. The handle specified must represent a valid KMT
+ * handle that is returned by IDXGIResource::GetSharedHandle when
+ * referring to a IDXGIKeyedMutex object.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd, then
+ * ::cudaExternalSemaphoreHandleDesc::handle::fd must be a valid file
+ * descriptor referencing a synchronization object. Ownership of the
+ * file descriptor is transferred to the CUDA driver when the handle
+ * is imported successfully. Performing any operations on the file
+ * descriptor after it is imported results in undefined behavior.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32, then exactly one of
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a synchronization object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::cudaExternalSemaphoreHandleDesc::handle::win32::name is
+ * not NULL, then it must name a valid synchronization object.
+ *
+ * \param extSem_out    - Returned handle to an external semaphore
+ * \param semHandleDesc - Semaphore import handle descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorOperatingSystem
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDestroyExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaImportExternalSemaphore(cudaExternalSemaphore_t *extSem_out, const struct cudaExternalSemaphoreHandleDesc *semHandleDesc);
+
+/**
+ * \brief Signals a set of external semaphore objects
+ *
+ * Enqueues a signal operation on a set of externally allocated
+ * semaphore objects in the specified stream. The operations will be
+ * executed when all prior operations in the stream complete.
+ *
+ * The exact semantics of signaling a semaphore depends on the type of
+ * the object.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeOpaqueFd,
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32,
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt
+ * then signaling the semaphore will set it to the signaled state.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeD3D12Fence,
+ * ::cudaExternalSemaphoreHandleTypeD3D11Fence,
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd,
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32
+ * then the semaphore will be set to the value specified in
+ * ::cudaExternalSemaphoreSignalParams::params::fence::value.
+ *
+ * If the semaphore object is of the type ::cudaExternalSemaphoreHandleTypeNvSciSync
+ * this API sets ::cudaExternalSemaphoreSignalParams::params::nvSciSync::fence to a
+ * value that can be used by subsequent waiters of the same NvSciSync object to
+ * order operations with those currently submitted in \p stream. Such an update
+ * will overwrite previous contents of
+ * ::cudaExternalSemaphoreSignalParams::params::nvSciSync::fence. By default,
+ * signaling such an external semaphore object causes appropriate memory synchronization
+ * operations to be performed over all the external memory objects that are imported as
+ * ::cudaExternalMemoryHandleTypeNvSciBuf. This ensures that any subsequent accesses
+ * made by other importers of the same set of NvSciBuf memory object(s) are coherent.
+ * These operations can be skipped by specifying the flag
+ * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync, which can be used as a
+ * performance optimization when data coherency is not required. But specifying this
+ * flag in scenarios where data coherency is required results in undefined behavior.
+ * Also, for semaphore object of the type ::cudaExternalSemaphoreHandleTypeNvSciSync,
+ * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
+ * ::cudaDeviceGetNvSciSyncAttributes to cudaNvSciSyncAttrSignal, this API will return
+ * cudaErrorNotSupported.
+ * 
+ * ::cudaExternalSemaphoreSignalParams::params::nvSciSync::fence associated with 
+ * semaphore object of the type ::cudaExternalSemaphoreHandleTypeNvSciSync can be 
+ * deterministic. For this the NvSciSyncAttrList used to create the semaphore object 
+ * must have value of NvSciSyncAttrKey_RequireDeterministicFences key set to true. 
+ * Deterministic fences allow users to enqueue a wait over the semaphore object even 
+ * before corresponding signal is enqueued. For such a semaphore object, CUDA guarantees 
+ * that each signal operation will increment the fence value by '1'. Users are expected 
+ * to track count of signals enqueued on the semaphore object and insert waits accordingly. 
+ * When such a semaphore object is signaled from multiple streams, due to concurrent 
+ * stream execution, it is possible that the order in which the semaphore gets signaled 
+ * is nondeterministic. This could lead to waiters of the semaphore getting unblocked
+ * incorrectly. Users are expected to handle such situations, either by not using the 
+ * same semaphore object with deterministic fence support enabled in different streams 
+ * or by adding explicit dependency amongst such streams so that the semaphore is 
+ * signaled in order.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutex,
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt,
+ * then the keyed mutex will be released with the key specified in
+ * ::cudaExternalSemaphoreSignalParams::params::keyedmutex::key.
+ *
+ * \param extSemArray - Set of external semaphores to be signaled
+ * \param paramsArray - Array of semaphore parameters
+ * \param numExtSems  - Number of semaphores to signal
+ * \param stream     - Stream to enqueue the signal operations in
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaImportExternalSemaphore,
+ * ::cudaDestroyExternalSemaphore,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Waits on a set of external semaphore objects
+ *
+ * Enqueues a wait operation on a set of externally allocated
+ * semaphore objects in the specified stream. The operations will be
+ * executed when all prior operations in the stream complete.
+ *
+ * The exact semantics of waiting on a semaphore depends on the type
+ * of the object.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeOpaqueFd,
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32,
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt
+ * then waiting on the semaphore will wait until the semaphore reaches
+ * the signaled state. The semaphore will then be reset to the
+ * unsignaled state. Therefore for every signal operation, there can
+ * only be one wait operation.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeD3D12Fence,
+ * ::cudaExternalSemaphoreHandleTypeD3D11Fence,
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd,
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32
+ * then waiting on the semaphore will wait until the value of the
+ * semaphore is greater than or equal to
+ * ::cudaExternalSemaphoreWaitParams::params::fence::value.
+ *
+ * If the semaphore object is of the type ::cudaExternalSemaphoreHandleTypeNvSciSync
+ * then, waiting on the semaphore will wait until the
+ * ::cudaExternalSemaphoreWaitParams::params::nvSciSync::fence is signaled by the
+ * signaler of the NvSciSyncObj that was associated with this semaphore object.
+ * By default, waiting on such an external semaphore object causes appropriate
+ * memory synchronization operations to be performed over all external memory objects
+ * that are imported as ::cudaExternalMemoryHandleTypeNvSciBuf. This ensures that
+ * any subsequent accesses made by other importers of the same set of NvSciBuf memory
+ * object(s) are coherent. These operations can be skipped by specifying the flag
+ * ::cudaExternalSemaphoreWaitSkipNvSciBufMemSync, which can be used as a
+ * performance optimization when data coherency is not required. But specifying this
+ * flag in scenarios where data coherency is required results in undefined behavior.
+ * Also, for semaphore object of the type ::cudaExternalSemaphoreHandleTypeNvSciSync,
+ * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
+ * ::cudaDeviceGetNvSciSyncAttributes to cudaNvSciSyncAttrWait, this API will return
+ * cudaErrorNotSupported.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutex,
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt,
+ * then the keyed mutex will be acquired when it is released with the key specified
+ * in ::cudaExternalSemaphoreWaitParams::params::keyedmutex::key or
+ * until the timeout specified by
+ * ::cudaExternalSemaphoreWaitParams::params::keyedmutex::timeoutMs
+ * has lapsed. The timeout interval can either be a finite value
+ * specified in milliseconds or an infinite value. In case an infinite
+ * value is specified the timeout never elapses. The Windows INFINITE
+ * macro must be used to specify an infinite timeout.
+ *
+ * \param extSemArray - External semaphores to be waited on
+ * \param paramsArray - Array of semaphore parameters
+ * \param numExtSems  - Number of semaphores to wait on
+ * \param stream      - Stream to enqueue the wait operations in
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorTimeout
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaImportExternalSemaphore,
+ * ::cudaDestroyExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Destroys an external semaphore
+ *
+ * Destroys an external semaphore object and releases any references
+ * to the underlying resource. Any outstanding signals or waits must
+ * have completed before the semaphore is destroyed.
+ *
+ * \param extSem - External semaphore to be destroyed
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa ::cudaImportExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDestroyExternalSemaphore(cudaExternalSemaphore_t extSem);
+
+/** @} */ /* END CUDART_EXTRES_INTEROP */
+
+/**
+ * \defgroup CUDART_EXECUTION Execution Control
+ *
+ * ___MANBRIEF___ execution control functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the execution control functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Some functions have overloaded C++ API template versions documented separately in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * @{
+ */
+
+/**
+ * \brief Launches a device function
+ *
+ * The function invokes kernel \p func on \p gridDim (\p gridDim.x &times; \p gridDim.y
+ * &times; \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x &times;
+ * \p blockDim.y &times; \p blockDim.z) threads.
+ *
+ * If the kernel has N parameters, \p args should point to an array of N pointers.
+ * Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, points to the region
+ * of memory from which the actual parameter will be copied.
+ *
+ * For templated functions, pass the function symbol as follows:
+ * func_name<template_arg_0,...,template_arg_N>
+ *
+ * \p sharedMem sets the amount of dynamic shared memory that will be available to
+ * each thread block.
+ *
+ * \p stream specifies a stream the invocation is associated to.
+ *
+ * \param func        - Device function symbol
+ * \param gridDim     - Grid dimensions
+ * \param blockDim    - Block dimensions
+ * \param args        - Arguments
+ * \param sharedMem   - Shared memory
+ * \param stream      - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
+ * ::cuLaunchKernel
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
+
+/**
+ * \brief Launches a CUDA function with launch-time configuration
+ *
+ * Note that the functionally equivalent variadic template ::cudaLaunchKernelEx
+ * is available for C++11 and newer.
+ *
+ * Invokes the kernel \p func on \p config->gridDim (\p config->gridDim.x
+ * &times; \p config->gridDim.y &times; \p config->gridDim.z) grid of blocks.
+ * Each block contains \p config->blockDim (\p config->blockDim.x &times;
+ * \p config->blockDim.y &times; \p config->blockDim.z) threads.
+ *
+ * \p config->dynamicSmemBytes sets the amount of dynamic shared memory that
+ * will be available to each thread block.
+ *
+ * \p config->stream specifies a stream the invocation is associated to.
+ *
+ * Configuration beyond grid and block dimensions, dynamic shared memory size,
+ * and stream can be provided with the following two fields of \p config:
+ *
+ * \p config->attrs is an array of \p config->numAttrs contiguous
+ * ::cudaLaunchAttribute elements. The value of this pointer is not considered
+ * if \p config->numAttrs is zero. However, in that case, it is recommended to
+ * set the pointer to NULL.                                  
+ * \p config->numAttrs is the number of attributes populating the first
+ * \p config->numAttrs positions of the \p config->attrs array.
+ *
+ * If the kernel has N parameters, \p args should point to an array of N
+ * pointers. Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, points
+ * to the region of memory from which the actual parameter will be copied.
+ *
+ * N.B. This function is so named to avoid unintentionally invoking the
+ *      templated version, \p cudaLaunchKernelEx, for kernels taking a single
+ *      void** or void* parameter.
+ *
+ * \param config - Launch configuration
+ * \param func   - Kernel to launch
+ * \param args   - Array of pointers to kernel parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaLaunchKernelEx(const cudaLaunchConfig_t *config, void (*kernel)(ExpTypes...), ActTypes &&... args) "cudaLaunchKernelEx (C++ API)",
+ * ::cuLaunchKernelEx
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLaunchKernelExC(const cudaLaunchConfig_t *config, const void *func, void **args);
+
+/**
+ * \brief Launches a device function where thread blocks can cooperate and synchronize as they execute
+ *
+ * The function invokes kernel \p func on \p gridDim (\p gridDim.x &times; \p gridDim.y
+ * &times; \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x &times;
+ * \p blockDim.y &times; \p blockDim.z) threads.
+ *
+ * The device on which this kernel is invoked must have a non-zero value for
+ * the device attribute ::cudaDevAttrCooperativeLaunch.
+ *
+ * The total number of blocks launched cannot exceed the maximum number of blocks per
+ * multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::cudaDevAttrMultiProcessorCount.
+ *
+ * The kernel cannot make use of CUDA dynamic parallelism.
+ *
+ * If the kernel has N parameters, \p args should point to an array of N pointers.
+ * Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, points to the region
+ * of memory from which the actual parameter will be copied.
+ *
+ * For templated functions, pass the function symbol as follows:
+ * func_name<template_arg_0,...,template_arg_N>
+ *
+ * \p sharedMem sets the amount of dynamic shared memory that will be available to
+ * each thread block.
+ *
+ * \p stream specifies a stream the invocation is associated to.
+ *
+ * \param func        - Device function symbol
+ * \param gridDim     - Grid dimensions
+ * \param blockDim    - Block dimensions
+ * \param args        - Arguments
+ * \param sharedMem   - Shared memory
+ * \param stream      - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorCooperativeLaunchTooLarge,
+ * ::cudaErrorSharedObjectInitFailed
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaLaunchCooperativeKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C++ API)",
+ * ::cudaLaunchCooperativeKernelMultiDevice,
+ * ::cuLaunchCooperativeKernel
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
+
+/**
+ * \brief Launches device functions on multiple devices where thread blocks can cooperate and synchronize as they execute
+ *
+ * \deprecated This function is deprecated as of CUDA 11.3.
+ *
+ * Invokes kernels as specified in the \p launchParamsList array where each element
+ * of the array specifies all the parameters required to perform a single kernel launch.
+ * These kernels can cooperate and synchronize as they execute. The size of the array is
+ * specified by \p numDevices.
+ *
+ * No two kernels can be launched on the same device. All the devices targeted by this
+ * multi-device launch must be identical. All devices must have a non-zero value for the
+ * device attribute ::cudaDevAttrCooperativeMultiDeviceLaunch.
+ *
+ * The same kernel must be launched on all devices. Note that any __device__ or __constant__
+ * variables are independently instantiated on every device. It is the application's
+ * responsibility to ensure these variables are initialized and used appropriately.
+ *
+ * The size of the grids as specified in blocks, the size of the blocks themselves and the
+ * amount of shared memory used by each thread block must also match across all launched kernels.
+ *
+ * The streams used to launch these kernels must have been created via either ::cudaStreamCreate
+ * or ::cudaStreamCreateWithFlags or ::cudaStreamCreateWithPriority. The NULL stream or
+ * ::cudaStreamLegacy or ::cudaStreamPerThread cannot be used.
+ *
+ * The total number of blocks launched per kernel cannot exceed the maximum number of blocks
+ * per multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::cudaDevAttrMultiProcessorCount. Since the
+ * total number of blocks launched per device has to match across all devices, the maximum
+ * number of blocks that can be launched per device will be limited by the device with the
+ * least number of multiprocessors.
+ *
+ * The kernel cannot make use of CUDA dynamic parallelism.
+ *
+ * The ::cudaLaunchParams structure is defined as:
+ * \code
+        struct cudaLaunchParams
+        {
+            void *func;
+            dim3 gridDim;
+            dim3 blockDim;
+            void **args;
+            size_t sharedMem;
+            cudaStream_t stream;
+        };
+ * \endcode
+ * where:
+ * - ::cudaLaunchParams::func specifies the kernel to be launched. The same function must
+ *   be launched on all devices. For templated functions, pass the function symbol as follows:
+ *   func_name<template_arg_0,...,template_arg_N>
+ * - ::cudaLaunchParams::gridDim specifies the width, height and depth of the grid in blocks.
+ *   This must match across all kernels launched.
+ * - ::cudaLaunchParams::blockDim is the width, height and depth of each thread block. This
+ *   must match across all kernels launched.
+ * - ::cudaLaunchParams::args specifies the arguments to the kernel. If the kernel has
+ *   N parameters then ::cudaLaunchParams::args should point to an array of N pointers. Each
+ *   pointer, from <tt>::cudaLaunchParams::args[0]</tt> to <tt>::cudaLaunchParams::args[N - 1]</tt>,
+ *   points to the region of memory from which the actual parameter will be copied.
+ * - ::cudaLaunchParams::sharedMem is the dynamic shared-memory size per thread block in bytes.
+ *   This must match across all kernels launched.
+ * - ::cudaLaunchParams::stream is the handle to the stream to perform the launch in. This cannot
+ *   be the NULL stream or ::cudaStreamLegacy or ::cudaStreamPerThread.
+ *
+ * By default, the kernel won't begin execution on any GPU until all prior work in all the specified
+ * streams has completed. This behavior can be overridden by specifying the flag
+ * ::cudaCooperativeLaunchMultiDeviceNoPreSync. When this flag is specified, each kernel
+ * will only wait for prior work in the stream corresponding to that GPU to complete before it begins
+ * execution.
+ *
+ * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin
+ * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying
+ * the flag ::cudaCooperativeLaunchMultiDeviceNoPostSync. When this flag is specified,
+ * any subsequent work pushed in any of the specified streams will only wait for the kernel launched
+ * on the GPU corresponding to that stream to complete before it begins execution.
+ *
+ * \param launchParamsList - List of launch parameters, one per device
+ * \param numDevices       - Size of the \p launchParamsList array
+ * \param flags            - Flags to control launch behavior
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorCooperativeLaunchTooLarge,
+ * ::cudaErrorSharedObjectInitFailed
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaLaunchCooperativeKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C++ API)",
+ * ::cudaLaunchCooperativeKernel,
+ * ::cuLaunchCooperativeKernelMultiDevice
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *launchParamsList, unsigned int numDevices, unsigned int flags  __dv(0));
+
+/**
+ * \brief Sets the preferred cache configuration for a device function
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p cacheConfig the preferred cache configuration
+ * for the function specified via \p func. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute \p func.
+ *
+ * \p func is a device function symbol and must be declared as a
+ * \c __global__ function. If the specified function does not exist,
+ * then ::cudaErrorInvalidDeviceFunction is returned. For templated functions,
+ * pass the function symbol as follows: func_name<template_arg_0,...,template_arg_N>
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
+ *
+ * \param func        - Device function symbol
+ * \param cacheConfig - Requested cache configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction
+ * \notefnerr
+ * \note_string_api_deprecation2
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa 
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
+ * ::cuFuncSetCacheConfig
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig);
+
+/**
+ * \brief Find out attributes for a given function
+ *
+ * This function obtains the attributes of a function specified via \p func.
+ * \p func is a device function symbol and must be declared as a
+ * \c __global__ function. The fetched attributes are placed in \p attr.
+ * If the specified function does not exist, then
+ * ::cudaErrorInvalidDeviceFunction is returned. For templated functions, pass
+ * the function symbol as follows: func_name<template_arg_0,...,template_arg_N>
+ *
+ * Note that some function attributes such as
+ * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock"
+ * may vary based on the device that is currently being used.
+ *
+ * \param attr - Return pointer to function's attributes
+ * \param func - Device function symbol
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction
+ * \notefnerr
+ * \note_string_api_deprecation2
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa 
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)",
+ * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
+ * ::cuFuncGetAttribute
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
+
+
+/**
+ * \brief Set attributes for a given function
+ *
+ * This function sets the attributes of a function specified via \p func.
+ * The parameter \p func must be a pointer to a function that executes
+ * on the device. The parameter specified by \p func must be declared as a \p __global__
+ * function. The enumeration defined by \p attr is set to the value defined by \p value.
+ * If the specified function does not exist, then ::cudaErrorInvalidDeviceFunction is returned.
+ * If the specified attribute cannot be written, or if the value is incorrect, 
+ * then ::cudaErrorInvalidValue is returned.
+ *
+ * Valid values for \p attr are:
+ * - ::cudaFuncAttributeMaxDynamicSharedMemorySize - The requested maximum size in bytes of dynamically-allocated shared memory. The sum of this value and the function attribute ::sharedSizeBytes
+ *   cannot exceed the device attribute ::cudaDevAttrMaxSharedMemoryPerBlockOptin. The maximal size of requestable dynamic shared memory may differ by GPU architecture.
+ * - ::cudaFuncAttributePreferredSharedMemoryCarveout - On devices where the L1 cache and shared memory use the same hardware resources, 
+ *   this sets the shared memory carveout preference, in percent of the total shared memory. See ::cudaDevAttrMaxSharedMemoryPerMultiprocessor.
+ *   This is only a hint, and the driver can choose a different ratio if required to execute the function.
+ *
+ * \param func  - Function to set attributes of
+ * \param attr  - Attribute to set
+ * \param value - Value to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)"
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value);
+
+/**
+ * \brief Returns the function name for a device entry function pointer.
+ *
+ * Returns in \p **name the function name associated with the symbol \p func .
+ * The function name is returned as a null-terminated string. This API may
+ * return a mangled name if the function is not declared as having C linkage.
+ * If \p **name is NULL, ::cudaErrorInvalidValue is returned. If \p func is
+ * not a device entry function, ::cudaErrorInvalidDeviceFunction is returned.
+ *
+ * \param name - The returned name of the function
+ * \param func - The function pointer to retrieve name for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDeviceFunction
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaFuncGetName(const char **name, const T *func) "cudaFuncGetName (C++ API)"
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetName(const char **name, const void *func);
+
+/**
+ * \brief Returns the offset and size of a kernel parameter in the device-side parameter layout.
+ *
+ * Queries the kernel parameter at \p paramIndex in \p func's list of parameters and returns
+ * parameter information via \p paramOffset and \p paramSize. \p paramOffset returns the
+ * offset of the parameter in the device-side parameter layout. \p paramSize returns the size
+ * in bytes of the parameter. This information can be used to update kernel node parameters
+ * from the device via ::cudaGraphKernelNodeSetParam() and ::cudaGraphKernelNodeUpdatesApply().
+ * \p paramIndex must be less than the number of parameters that \p func takes.
+ *
+ * \param func        - The function to query
+ * \param paramIndex  - The parameter index to query
+ * \param paramOffset - The offset into the device-side parameter layout at which the parameter resides
+ * \param paramSize   - The size of the parameter in the device-side parameter layout
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetParamInfo(const void *func, size_t paramIndex, size_t *paramOffset, size_t *paramSize);
+
+/**
+ * \brief Converts a double argument to be executed on a device
+ *
+ * \param d - Double to convert
+ *
+ * \deprecated This function is deprecated as of CUDA 7.5
+ *
+ * Converts the double value of \p d to an internal float representation if
+ * the device does not support double arithmetic. If the device does natively
+ * support doubles, then this function does nothing.
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * ::cudaSetDoubleForHost
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaSetDoubleForDevice(double *d);
+
+/**
+ * \brief Converts a double argument after execution on a device
+ *
+ * \deprecated This function is deprecated as of CUDA 7.5
+ *
+ * Converts the double value of \p d from a potentially internal float
+ * representation if the device does not support double arithmetic. If the
+ * device does natively support doubles, then this function does nothing.
+ *
+ * \param d - Double to convert
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * ::cudaSetDoubleForDevice
+ */
+extern __CUDA_DEPRECATED  __host__ cudaError_t CUDARTAPI cudaSetDoubleForHost(double *d);
+
+/**
+ * \brief Enqueues a host function call in a stream
+ *
+ * Enqueues a host function to run in a stream.  The function will be called
+ * after currently enqueued work and will block work added after it.
+ *
+ * The host function must not make any CUDA API calls.  Attempting to use a
+ * CUDA API may result in ::cudaErrorNotPermitted, but this is not required.
+ * The host function must not perform any synchronization that may depend on
+ * outstanding CUDA work not mandated to run earlier.  Host functions without a
+ * mandated order (such as in independent streams) execute in undefined order
+ * and may be serialized.
+ *
+ * For the purposes of Unified Memory, execution makes a number of guarantees:
+ * <ul>
+ *   <li>The stream is considered idle for the duration of the function's
+ *   execution.  Thus, for example, the function may always use memory attached
+ *   to the stream it was enqueued in.</li>
+ *   <li>The start of execution of the function has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the function.  It thus synchronizes streams which have been "joined"
+ *   prior to the function.</li>
+ *   <li>Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding host functions and stream callbacks
+ *   have executed.  Thus, for
+ *   example, a function might use global attached memory even if work has
+ *   been added to another stream, if the work has been ordered behind the
+ *   function call with an event.</li>
+ *   <li>Completion of the function does not cause a stream to become
+ *   active except as described above.  The stream will remain idle
+ *   if no device work follows the function, and will remain idle across
+ *   consecutive host functions or stream callbacks without device work in
+ *   between.  Thus, for example,
+ *   stream synchronization can be done by signaling from a host function at the
+ *   end of the stream.</li>
+ * </ul>
+ *
+ * Note that, in contrast to ::cuStreamAddCallback, the function will not be
+ * called in the event of an error in the CUDA context.
+ *
+ * \param stream   - Stream to enqueue function call in
+ * \param fn       - The function to call once preceding stream operations are complete
+ * \param userData - User-specified data to be passed to the function
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate,
+ * ::cudaStreamQuery,
+ * ::cudaStreamSynchronize,
+ * ::cudaStreamWaitEvent,
+ * ::cudaStreamDestroy,
+ * ::cudaMallocManaged,
+ * ::cudaStreamAttachMemAsync,
+ * ::cudaStreamAddCallback,
+ * ::cuLaunchHostFunc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream, cudaHostFn_t fn, void *userData);
+
+/** @} */ /* END CUDART_EXECUTION */
+
+/**
+ * \defgroup CUDART_EXECUTION_DEPRECATED Execution Control [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated execution control functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the deprecated execution control functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Some functions have overloaded C++ API template versions documented separately in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * @{
+ */
+
+/**
+ * \brief Sets the shared memory configuration for a device function
+ *
+ * \deprecated
+ *
+ * On devices with configurable shared memory banks, this function will 
+ * force all subsequent launches of the specified device function to have
+ * the given shared memory bank size configuration. On any given launch of the
+ * function, the shared memory configuration of the device will be temporarily
+ * changed if needed to suit the function's preferred configuration. Changes in
+ * shared memory configuration between subsequent launches of functions, 
+ * may introduce a device side synchronization point.
+ *
+ * Any per-function setting of shared memory bank size set via 
+ * ::cudaFuncSetSharedMemConfig will override the device wide setting set by
+ * ::cudaDeviceSetSharedMemConfig.
+ *
+ * Changing the shared memory bank size will not increase shared memory usage
+ * or affect occupancy of kernels, but may have major effects on performance. 
+ * Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ * but will change what kinds of accesses to shared memory will result in bank 
+ * conflicts.
+ *
+ * This function will do nothing on devices with fixed shared memory bank size.
+ *
+ * For templated functions, pass the function symbol as follows:
+ * func_name<template_arg_0,...,template_arg_N>
+ *
+ * The supported bank configurations are:
+ * - ::cudaSharedMemBankSizeDefault: use the device's shared memory configuration
+ *   when launching this function.
+ * - ::cudaSharedMemBankSizeFourByte: set shared memory bank width to be 
+ *   four bytes natively when launching this function.
+ * - ::cudaSharedMemBankSizeEightByte: set shared memory bank width to be eight 
+ *   bytes natively when launching this function.
+ *
+ * \param func   - Device function symbol
+ * \param config - Requested shared memory configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_string_api_deprecation2
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetSharedMemConfig,
+ * ::cudaDeviceGetSharedMemConfig,
+ * ::cudaDeviceSetCacheConfig,
+ * ::cudaDeviceGetCacheConfig,
+ * ::cudaFuncSetCacheConfig,
+ * ::cuFuncSetSharedMemConfig
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config);
+/** @} */ /* END CUDART_EXECUTION_DEPRECATED */
+
+/**
+ * \defgroup CUDART_OCCUPANCY Occupancy
+ *
+ * ___MANBRIEF___ occupancy calculation functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the occupancy calculation functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Besides the occupancy calculator functions
+ * (\ref ::cudaOccupancyMaxActiveBlocksPerMultiprocessor and \ref ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags),
+ * there are also C++ only occupancy-based launch configuration functions documented in
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * See
+ * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)",
+ * \ref ::cudaOccupancyAvailableDynamicSMemPerBlock(size_t*, T, int, int) "cudaOccupancyAvailableDynamicSMemPerBlock (C++ API)"
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns occupancy for a device function
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor for the device function.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags,
+ * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)",
+ * \ref ::cudaOccupancyAvailableDynamicSMemPerBlock(size_t*, T, int, int) "cudaOccupancyAvailableDynamicSMemPerBlock (C++ API)",
+ * ::cuOccupancyMaxActiveBlocksPerMultiprocessor
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize);
+
+/**
+ * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM.
+ *
+ * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. 
+ *
+ * \param dynamicSmemSize - Returned maximum dynamic shared memory 
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param numBlocks       - Number of blocks to fit on SM 
+ * \param blockSize       - Size of the block
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags,
+ * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)",
+ * ::cuOccupancyAvailableDynamicSMemPerBlock
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, const void *func, int numBlocks, int blockSize);
+
+/**
+ * \brief Returns occupancy for a device function with the specified flags
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor for the device function.
+ *
+ * The \p flags parameter controls how special cases are handled. Valid flags include:
+ *
+ * - ::cudaOccupancyDefault: keeps the default behavior as
+ *   ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ *
+ * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
+ *   on platforms where global caching affects occupancy. On such platforms, if caching
+ *   is enabled, but per-block SM resource usage would result in zero occupancy, the
+ *   occupancy calculator will calculate the occupancy as if caching is disabled.
+ *   Setting this flag makes the occupancy calculator return 0 in such cases.
+ *   More information can be found about this feature in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param flags           - Requested behavior for the occupancy calculator
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor,
+ * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)",
+ * \ref ::cudaOccupancyAvailableDynamicSMemPerBlock(size_t*, T, int, int) "cudaOccupancyAvailableDynamicSMemPerBlock (C++ API)",
+ * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
+
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p launchConfig), return the maximum cluster size in \p *clusterSize.
+ *
+ * The cluster dimensions in \p launchConfig are ignored. If \p func has a required
+ * cluster size set (see ::cudaFuncGetAttributes), \p *clusterSize will reflect
+ * the required cluster size.
+ *
+ * By default this function will always return a value that's portable on
+ * future hardware. A higher value may be returned if the kernel function
+ * allows non-portable cluster sizes.
+ *
+ * This function will respect the compile time launch bounds.
+ *
+ * \param clusterSize  - Returned maximum cluster size that can be launched
+ *                       for the given kernel function and launch configuration
+ * \param func         - Kernel function for which maximum cluster
+ *                       size is calculated
+ * \param launchConfig - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaFuncGetAttributes,
+ * \ref ::cudaOccupancyMaxPotentialClusterSize(int*, T, const cudaLaunchConfig_t*) "cudaOccupancyMaxPotentialClusterSize (C++ API)",
+ * ::cuOccupancyMaxPotentialClusterSize
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxPotentialClusterSize(int *clusterSize, const void *func, const cudaLaunchConfig_t *launchConfig);
+
+
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p launchConfig), return the maximum number of clusters that could co-exist
+ * on the target device in \p *numClusters.
+ *
+ * If the function has required cluster size already set (see
+ * ::cudaFuncGetAttributes), the cluster size from \p launchConfig must either be
+ * unspecified or match the required size.
+ * Without required sizes, the cluster size must be specified in \p launchConfig,
+ * else the function will return an error.
+ *
+ * Note that various attributes of the kernel function may affect occupancy
+ * calculation. Runtime environment may affect how the hardware schedules
+ * the clusters, so the calculated occupancy is not guaranteed to be achievable.
+ *
+ * \param numClusters  - Returned maximum number of clusters that
+ *                       could co-exist on the target device
+ * \param func         - Kernel function for which maximum number
+ *                       of clusters is calculated
+ * \param launchConfig - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidClusterSize,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaFuncGetAttributes,
+ * \ref ::cudaOccupancyMaxActiveClusters(int*, T, const cudaLaunchConfig_t*) "cudaOccupancyMaxActiveClusters (C++ API)",
+ * ::cuOccupancyMaxActiveClusters
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveClusters(int *numClusters, const void *func, const cudaLaunchConfig_t *launchConfig);
+/** @} */ /* END CUDA_OCCUPANCY */
+
+/**
+ * \defgroup CUDART_MEMORY Memory Management
+ *
+ * ___MANBRIEF___ memory management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the memory management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Some functions have overloaded C++ API template versions documented separately in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * @{
+ */
+
+/**
+ * \brief Allocates memory that will be automatically managed by the Unified Memory system
+ *
+ * Allocates \p size bytes of managed memory on the device and returns in
+ * \p *devPtr a pointer to the allocated memory. If the device doesn't support
+ * allocating managed memory, ::cudaErrorNotSupported is returned. Support
+ * for managed memory can be queried using the device attribute
+ * ::cudaDevAttrManagedMemory. The allocated memory is suitably
+ * aligned for any kind of variable. The memory is not cleared. If \p size
+ * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer
+ * is valid on the CPU and on all GPUs in the system that support managed memory.
+ * All accesses to this pointer must obey the Unified Memory programming model.
+ *
+ * \p flags specifies the default stream association for this allocation.
+ * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The
+ * default value for \p flags is ::cudaMemAttachGlobal.
+ * If ::cudaMemAttachGlobal is specified, then this memory is accessible from
+ * any stream on any device. If ::cudaMemAttachHost is specified, then the
+ * allocation should not be accessed from devices that have a zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess; an explicit call to
+ * ::cudaStreamAttachMemAsync will be required to enable access on such devices.
+ *
+ * If the association is later changed via ::cudaStreamAttachMemAsync to
+ * a single stream, the default association, as specified during ::cudaMallocManaged,
+ * is restored when that stream is destroyed. For __managed__ variables, the
+ * default association is always ::cudaMemAttachGlobal. Note that destroying a
+ * stream is an asynchronous operation, and as a result, the change to default
+ * association won't happen until all work in the stream has completed.
+ *
+ * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree.
+ *
+ * Device memory oversubscription is possible for GPUs that have a non-zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess. Managed memory on
+ * such GPUs may be evicted from device memory to host memory at any time by the Unified
+ * Memory driver in order to make room for other allocations.
+ *
+ * In a system where all GPUs have a non-zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this
+ * API returns and instead may be populated on access. In such systems, managed memory can
+ * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to
+ * maintain data locality and prevent excessive page faults to the extent possible. The application
+ * can also guide the driver about memory usage patterns via ::cudaMemAdvise. The application
+ * can also explicitly migrate memory to a desired processor's memory via
+ * ::cudaMemPrefetchAsync.
+ *
+ * In a multi-GPU system where all of the GPUs have a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support
+ * with each other, the physical storage for managed memory is created on the GPU which is active
+ * at the time ::cudaMallocManaged is called. All other GPUs will reference the data at reduced
+ * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate
+ * memory among such GPUs.
+ *
+ * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and
+ * where the value of the device attribute ::cudaDevAttrConcurrentManagedAccess
+ * is zero for at least one of those GPUs, the location chosen for physical storage of managed
+ * memory is system-dependent.
+ * - On Linux, the location chosen will be device memory as long as the current set of active
+ * contexts are on devices that either have peer-to-peer support with each other or have a
+ * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * If there is an active context on a GPU that does not have a non-zero value for that device
+ * attribute and it does not have peer-to-peer support with the other devices that have active
+ * contexts on them, then the location for physical storage will be 'zero-copy' or host memory.
+ * Note that this means that managed memory that is located in device memory is migrated to
+ * host memory if a new context is created on a GPU that doesn't have a non-zero value for
+ * the device attribute and does not support peer-to-peer with at least one of the other devices
+ * that has an active context. This in turn implies that context creation may fail if there is
+ * insufficient host memory to migrate all managed allocations.
+ * - On Windows, the physical storage is always created in 'zero-copy' or host memory.
+ * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these
+ * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to
+ * restrict CUDA to only use those GPUs that have peer-to-peer support.
+ * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero
+ * value to force the driver to always use device memory for physical storage.
+ * When this environment variable is set to a non-zero value, all devices used in
+ * that process that support managed memory have to be peer-to-peer compatible
+ * with each other. The error ::cudaErrorInvalidDevice will be returned if a device
+ * that supports managed memory is used and it is not peer-to-peer compatible with
+ * any of the other managed memory supporting devices that were previously used in
+ * that process, even if ::cudaDeviceReset has been called on those devices. These
+ * environment variables are described in the CUDA programming guide under the
+ * "CUDA environment variables" section.
+ *
+ * \param devPtr - Pointer to allocated device memory
+ * \param size   - Requested allocation size in bytes
+ * \param flags  - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
+ * ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync,
+ * ::cuMemAllocManaged
+ */
+#if defined(__cplusplus)
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(void **devPtr, size_t size, unsigned int flags = cudaMemAttachGlobal);
+#else
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(void **devPtr, size_t size, unsigned int flags);
+#endif
+
+/**
+ * \brief Allocate memory on the device
+ *
+ * Allocates \p size bytes of linear memory on the device and returns in
+ * \p *devPtr a pointer to the allocated memory. The allocated memory is
+ * suitably aligned for any kind of variable. The memory is not cleared.
+ * ::cudaMalloc() returns ::cudaErrorMemoryAllocation in case of failure.
+ *
+ * The device version of ::cudaFree cannot be used with a \p *devPtr
+ * allocated using the host API, and vice versa.
+ *
+ * \param devPtr - Pointer to allocated device memory
+ * \param size   - Requested allocation size in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
+ * ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::cuMemAlloc
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
+
+/**
+ * \brief Allocates page-locked memory on the host
+ *
+ * Allocates \p size bytes of host memory that is page-locked and accessible
+ * to the device. The driver tracks the virtual memory ranges allocated with
+ * this function and automatically accelerates calls to functions such as
+ * ::cudaMemcpy*(). Since the memory can be accessed directly by the device,
+ * it can be read or written with much higher bandwidth than pageable memory
+ * obtained with functions such as ::malloc(). 
+ *
+ * On systems where ::pageableMemoryAccessUsesHostPageTables
+ * is true, ::cudaMallocHost may not page-lock the allocated memory.
+ *
+ * Page-locking excessive amounts of memory with ::cudaMallocHost() may degrade 
+ * system performance, since it reduces the amount of memory available to the 
+ * system for paging. As a result, this function is best used sparingly to allocate 
+ * staging areas for data exchange between host and device.
+ *
+ * \param ptr  - Pointer to allocated host memory
+ * \param size - Requested allocation size in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaMallocArray, ::cudaMalloc3D,
+ * ::cudaMalloc3DArray, ::cudaHostAlloc, ::cudaFree, ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t, unsigned int) "cudaMallocHost (C++ API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::cuMemAllocHost
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size);
+
+/**
+ * \brief Allocates pitched memory on the device
+ *
+ * Allocates at least \p width (in bytes) * \p height bytes of linear memory
+ * on the device and returns in \p *devPtr a pointer to the allocated memory.
+ * The function may pad the allocation to ensure that corresponding pointers
+ * in any given row will continue to meet the alignment requirements for
+ * coalescing as the address is updated from row to row. The pitch returned in
+ * \p *pitch by ::cudaMallocPitch() is the width in bytes of the allocation.
+ * The intended usage of \p pitch is as a separate parameter of the allocation,
+ * used to compute addresses within the 2D array. Given the row and column of
+ * an array element of type \p T, the address is computed as:
+ * \code
+    T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;
+   \endcode
+ *
+ * For allocations of 2D arrays, it is recommended that programmers consider
+ * performing pitch allocations using ::cudaMallocPitch(). Due to pitch
+ * alignment restrictions in the hardware, this is especially true if the
+ * application will be performing 2D memory copies between different regions
+ * of device memory (whether linear memory or CUDA arrays).
+ *
+ * \param devPtr - Pointer to allocated pitched device memory
+ * \param pitch  - Pitch for allocation
+ * \param width  - Requested pitched allocation width (in bytes)
+ * \param height - Requested pitched allocation height
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * ::cudaHostAlloc,
+ * ::cuMemAllocPitch
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height);
+
+/**
+ * \brief Allocate an array on the device
+ *
+ * Allocates a CUDA array according to the ::cudaChannelFormatDesc structure
+ * \p desc and returns a handle to the new CUDA array in \p *array.
+ *
+ * The ::cudaChannelFormatDesc is defined as:
+ * \code
+    struct cudaChannelFormatDesc {
+        int x, y, z, w;
+        enum cudaChannelFormatKind f;
+    };
+    \endcode
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
+ *
+ * The \p flags parameter enables different options to be specified that affect
+ * the allocation, as follows.
+ * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation
+ * - ::cudaArraySurfaceLoadStore: Allocates an array that can be read from or written to using a surface reference
+ * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the array.
+ * - ::cudaArraySparse: Allocates a CUDA array without physical backing memory. The subregions within this sparse array
+ *   can later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. 
+ *   The physical backing memory must be allocated via ::cuMemCreate.
+ * - ::cudaArrayDeferredMapping: Allocates a CUDA array without physical backing memory. The entire array can 
+ *   later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. 
+ *   The physical backing memory must be allocated via ::cuMemCreate.
+ *
+ * \p width and \p height must meet certain size requirements. See ::cudaMalloc3DArray() for more details.
+ *
+ * \param array  - Pointer to allocated array in device memory
+ * \param desc   - Requested channel format
+ * \param width  - Requested array allocation width
+ * \param height - Requested array allocation height
+ * \param flags  - Requested properties of allocated array
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * ::cudaHostAlloc,
+ * ::cuArrayCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(0), unsigned int flags __dv(0));
+
+/**
+ * \brief Frees memory on the device
+ *
+ * Frees the memory space pointed to by \p devPtr, which must have been
+ * returned by a previous call to one of the following memory allocation APIs -
+ * ::cudaMalloc(), ::cudaMallocPitch(), ::cudaMallocManaged(), ::cudaMallocAsync(),
+ * ::cudaMallocFromPoolAsync().
+ * 
+ * Note - This API will not perform any implicit synchronization when the pointer was
+ * allocated with ::cudaMallocAsync or ::cudaMallocFromPoolAsync. Callers must ensure
+ * that all accesses to the pointer have completed before invoking ::cudaFree. For
+ * best performance and memory reuse, users should use ::cudaFreeAsync to free memory
+ * allocated via the stream ordered memory allocator.
+ * 
+ * If ::cudaFree(\p devPtr) has already been called before,
+ * an error is returned. If \p devPtr is 0, no operation is performed.
+ * ::cudaFree() returns ::cudaErrorInvalidValue in case of failure.
+ *
+ * The device version of ::cudaFree cannot be used with a \p *devPtr
+ * allocated using the host API, and vice versa.
+ *
+ * \param devPtr - Device pointer to memory to free
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaMallocManaged, ::cudaMallocArray, ::cudaFreeArray, ::cudaMallocAsync, ::cudaMallocFromPoolAsync,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaFreeAsync,
+ * ::cudaHostAlloc,
+ * ::cuMemFree
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
+
+/**
+ * \brief Frees page-locked memory
+ *
+ * Frees the memory space pointed to by \p ptr, which must have been
+ * returned by a previous call to ::cudaMallocHost() or ::cudaHostAlloc().
+ *
+ * \param ptr - Pointer to memory to free
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray,
+ * ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaHostAlloc,
+ * ::cuMemFreeHost
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr);
+
+/**
+ * \brief Frees an array on the device
+ *
+ * Frees the CUDA array \p array, which must have been returned by a
+ * previous call to ::cudaMallocArray(). If \p array is 0,
+ * no operation is performed.
+ *
+ * \param array - Pointer to array to free
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::cuArrayDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array);
+
+/**
+ * \brief Frees a mipmapped array on the device
+ *
+ * Frees the CUDA mipmapped array \p mipmappedArray, which must have been
+ * returned by a previous call to ::cudaMallocMipmappedArray(). If
+ * \p mipmappedArray is 0, no operation is performed.
+ *
+ * \param mipmappedArray - Pointer to mipmapped array to free
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::cuMipmappedArrayDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray);
+
+
+/**
+ * \brief Allocates page-locked memory on the host
+ *
+ * Allocates \p size bytes of host memory that is page-locked and accessible
+ * to the device. The driver tracks the virtual memory ranges allocated with
+ * this function and automatically accelerates calls to functions such as
+ * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it
+ * can be read or written with much higher bandwidth than pageable memory
+ * obtained with functions such as ::malloc(). Allocating excessive amounts of
+ * pinned memory may degrade system performance, since it reduces the amount
+ * of memory available to the system for paging. As a result, this function is
+ * best used sparingly to allocate staging areas for data exchange between host
+ * and device.
+ *
+ * The \p flags parameter enables different options to be specified that affect
+ * the allocation, as follows.
+ * - ::cudaHostAllocDefault: This flag's value is defined to be 0 and causes
+ * ::cudaHostAlloc() to emulate ::cudaMallocHost().
+ * - ::cudaHostAllocPortable: The memory returned by this call will be
+ * considered as pinned memory by all CUDA contexts, not just the one that
+ * performed the allocation.
+ * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space.
+ * The device pointer to the memory may be obtained by calling
+ * ::cudaHostGetDevicePointer().
+ * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC).
+ * WC memory can be transferred across the PCI Express bus more quickly on some
+ * system configurations, but cannot be read efficiently by most CPUs.  WC
+ * memory is a good option for buffers that will be written by the CPU and read
+ * by the device via mapped pinned memory or host->device transfers.
+ *
+ * All of these flags are orthogonal to one another: a developer may allocate
+ * memory that is portable, mapped and/or write-combined with no restrictions.
+ *
+ * In order for the ::cudaHostAllocMapped flag to have any effect, the CUDA context
+ * must support the ::cudaDeviceMapHost flag, which can be checked via
+ * ::cudaGetDeviceFlags(). The ::cudaDeviceMapHost flag is implicitly set for
+ * contexts created via the runtime API.
+ *
+ * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices
+ * that do not support mapped pinned memory. The failure is deferred to
+ * ::cudaHostGetDevicePointer() because the memory may be mapped into other
+ * CUDA contexts via the ::cudaHostAllocPortable flag.
+ *
+ * Memory allocated by this function must be freed with ::cudaFreeHost().
+ *
+ * \param pHost - Pointer to allocated host memory
+ * \param size  - Requested allocation size in bytes
+ * \param flags - Requested properties of allocated memory
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaSetDeviceFlags,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost,
+ * ::cudaGetDeviceFlags,
+ * ::cuMemHostAlloc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size, unsigned int flags);
+
+/**
+ * \brief Registers an existing host memory range for use by CUDA
+ *
+ * Page-locks the memory range specified by \p ptr and \p size and maps it
+ * for the device(s) as specified by \p flags. This memory range also is added
+ * to the same tracking mechanism as ::cudaHostAlloc() to automatically accelerate
+ * calls to functions such as ::cudaMemcpy(). Since the memory can be accessed 
+ * directly by the device, it can be read or written with much higher bandwidth 
+ * than pageable memory that has not been registered.  Page-locking excessive
+ * amounts of memory may degrade system performance, since it reduces the amount
+ * of memory available to the system for paging. As a result, this function is
+ * best used sparingly to register staging areas for data exchange between
+ * host and device.
+ * 
+ * On systems where ::pageableMemoryAccessUsesHostPageTables is true, ::cudaHostRegister 
+ * will not page-lock the memory range specified by \p ptr but only populate 
+ * unpopulated pages.
+ *
+ * ::cudaHostRegister is supported only on I/O coherent devices that have a non-zero
+ * value for the device attribute ::cudaDevAttrHostRegisterSupported.
+ *
+ * The \p flags parameter enables different options to be specified that
+ * affect the allocation, as follows.
+ *
+ * - ::cudaHostRegisterDefault: On a system with unified virtual addressing,
+ *   the memory will be both mapped and portable.  On a system with no unified
+ *   virtual addressing, the memory will be neither mapped nor portable.
+ *
+ * - ::cudaHostRegisterPortable: The memory returned by this call will be
+ *   considered as pinned memory by all CUDA contexts, not just the one that
+ *   performed the allocation.
+ *
+ * - ::cudaHostRegisterMapped: Maps the allocation into the CUDA address
+ *   space. The device pointer to the memory may be obtained by calling
+ *   ::cudaHostGetDevicePointer().
+ *
+ * - ::cudaHostRegisterIoMemory: The passed memory pointer is treated as
+ *   pointing to some memory-mapped I/O space, e.g. belonging to a
+ *   third-party PCIe device, and it will be marked as non cache-coherent and
+ *   contiguous.
+ *
+ * - ::cudaHostRegisterReadOnly: The passed memory pointer is treated as
+ *   pointing to memory that is considered read-only by the device.  On
+ *   platforms without ::cudaDevAttrPageableMemoryAccessUsesHostPageTables, this
+ *   flag is required in order to register memory mapped to the CPU as
+ *   read-only.  Support for the use of this flag can be queried from the device
+ *   attribute cudaDeviceAttrReadOnlyHostRegisterSupported.  Using this flag with
+ *   a current context associated with a device that does not have this attribute
+ *   set will cause ::cudaHostRegister to error with cudaErrorNotSupported.
+ *
+ * All of these flags are orthogonal to one another: a developer may page-lock
+ * memory that is portable or mapped with no restrictions.
+ *
+ * The CUDA context must have been created with the ::cudaDeviceMapHost flag in
+ * order for the ::cudaHostRegisterMapped flag to have any effect.
+ *
+ * The ::cudaHostRegisterMapped flag may be specified on CUDA contexts for
+ * devices that do not support mapped pinned memory. The failure is deferred
+ * to ::cudaHostGetDevicePointer() because the memory may be mapped into
+ * other CUDA contexts via the ::cudaHostRegisterPortable flag.
+ *
+ * For devices that have a non-zero value for the device attribute
+ * ::cudaDevAttrCanUseHostPointerForRegisteredMem, the memory
+ * can also be accessed from the device using the host pointer \p ptr.
+ * The device pointer returned by ::cudaHostGetDevicePointer() may or may not
+ * match the original host pointer \p ptr and depends on the devices visible to the
+ * application. If all devices visible to the application have a non-zero value for the
+ * device attribute, the device pointer returned by ::cudaHostGetDevicePointer()
+ * will match the original pointer \p ptr. If any device visible to the application
+ * has a zero value for the device attribute, the device pointer returned by
+ * ::cudaHostGetDevicePointer() will not match the original host pointer \p ptr,
+ * but it will be suitable for use on all devices provided Unified Virtual Addressing
+ * is enabled. In such systems, it is valid to access the memory using either pointer
+ * on devices that have a non-zero value for the device attribute. Note however that
+ * such devices should access the memory using only one of the two pointers and not both.
+ *
+ * The memory page-locked by this function must be unregistered with ::cudaHostUnregister().
+ *
+ * \param ptr   - Host pointer to memory to page-lock
+ * \param size  - Size in bytes of the address range to page-lock
+ * \param flags - Flags for allocation request
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorHostMemoryAlreadyRegistered,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaHostUnregister, ::cudaHostGetFlags, ::cudaHostGetDevicePointer,
+ * ::cuMemHostRegister
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size, unsigned int flags);
+
+/**
+ * \brief Unregisters a memory range that was registered with cudaHostRegister
+ *
+ * Unmaps the memory range whose base address is specified by \p ptr, and makes
+ * it pageable again.
+ *
+ * The base address must be the same one specified to ::cudaHostRegister().
+ *
+ * \param ptr - Host pointer to memory to unregister
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorHostMemoryNotRegistered
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaHostRegister,
+ * ::cuMemHostUnregister
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr);
+
+/**
+ * \brief Passes back device pointer of mapped host memory allocated by
+ * cudaHostAlloc or registered by cudaHostRegister
+ *
+ * Passes back the device pointer corresponding to the mapped, pinned host
+ * buffer allocated by ::cudaHostAlloc() or registered by ::cudaHostRegister().
+ *
+ * ::cudaHostGetDevicePointer() will fail if the ::cudaDeviceMapHost flag was
+ * not specified before deferred context creation occurred, or if called on a
+ * device that does not support mapped, pinned memory.
+ *
+ * For devices that have a non-zero value for the device attribute
+ * ::cudaDevAttrCanUseHostPointerForRegisteredMem, the memory
+ * can also be accessed from the device using the host pointer \p pHost.
+ * The device pointer returned by ::cudaHostGetDevicePointer() may or may not
+ * match the original host pointer \p pHost and depends on the devices visible to the
+ * application. If all devices visible to the application have a non-zero value for the
+ * device attribute, the device pointer returned by ::cudaHostGetDevicePointer()
+ * will match the original pointer \p pHost. If any device visible to the application
+ * has a zero value for the device attribute, the device pointer returned by
+ * ::cudaHostGetDevicePointer() will not match the original host pointer \p pHost,
+ * but it will be suitable for use on all devices provided Unified Virtual Addressing
+ * is enabled. In such systems, it is valid to access the memory using either pointer
+ * on devices that have a non-zero value for the device attribute. Note however that
+ * such devices should access the memory using only one of the two pointers and not both.
+ *
+ * \p flags is reserved for future releases.  For now, it must be set to 0.
+ *
+ * \param pDevice - Returned device pointer for mapped memory
+ * \param pHost   - Requested host pointer mapping
+ * \param flags   - Flags for extensions (must be 0 for now)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaSetDeviceFlags, ::cudaHostAlloc,
+ * ::cuMemHostGetDevicePointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags);
+
+/**
+ * \brief Passes back flags used to allocate pinned host memory allocated by
+ * cudaHostAlloc
+ *
+ * ::cudaHostGetFlags() will fail if the input pointer does not
+ * reside in an address range allocated by ::cudaHostAlloc().
+ *
+ * \param pFlags - Returned flags word
+ * \param pHost - Host pointer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaHostAlloc,
+ * ::cuMemHostGetFlags
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags, void *pHost);
+
+/**
+ * \brief Allocates logical 1D, 2D, or 3D memory objects on the device
+ *
+ * Allocates at least \p width * \p height * \p depth bytes of linear memory
+ * on the device and returns a ::cudaPitchedPtr in which \p ptr is a pointer
+ * to the allocated memory. The function may pad the allocation to ensure
+ * hardware alignment requirements are met. The pitch returned in the \p pitch
+ * field of \p pitchedDevPtr is the width in bytes of the allocation.
+ *
+ * The returned ::cudaPitchedPtr contains additional fields \p xsize and
+ * \p ysize, the logical width and height of the allocation, which are
+ * equivalent to the \p width and \p height \p extent parameters provided by
+ * the programmer during allocation.
+ *
+ * For allocations of 2D and 3D objects, it is highly recommended that
+ * programmers perform allocations using ::cudaMalloc3D() or
+ * ::cudaMallocPitch(). Due to alignment restrictions in the hardware, this is
+ * especially true if the application will be performing memory copies
+ * involving 2D or 3D objects (whether linear memory or CUDA arrays).
+ *
+ * \param pitchedDevPtr  - Pointer to allocated pitched device memory
+ * \param extent         - Requested allocation size (\p width field in bytes)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMemcpy3D, ::cudaMemset3D,
+ * ::cudaMalloc3DArray, ::cudaMallocArray, ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc, ::make_cudaPitchedPtr, ::make_cudaExtent,
+ * ::cuMemAllocPitch
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent);
+
+/**
+ * \brief Allocate an array on the device
+ *
+ * Allocates a CUDA array according to the ::cudaChannelFormatDesc structure
+ * \p desc and returns a handle to the new CUDA array in \p *array.
+ *
+ * The ::cudaChannelFormatDesc is defined as:
+ * \code
+    struct cudaChannelFormatDesc {
+        int x, y, z, w;
+        enum cudaChannelFormatKind f;
+    };
+    \endcode
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
+ *
+ * ::cudaMalloc3DArray() can allocate the following:
+ *
+ * - A 1D array is allocated if the height and depth extents are both zero.
+ * - A 2D array is allocated if only the depth extent is zero.
+ * - A 3D array is allocated if all three extents are non-zero.
+ * - A 1D layered CUDA array is allocated if only the height extent is zero and
+ * the cudaArrayLayered flag is set. Each layer is a 1D array. The number of layers is 
+ * determined by the depth extent.
+ * - A 2D layered CUDA array is allocated if all three extents are non-zero and 
+ * the cudaArrayLayered flag is set. Each layer is a 2D array. The number of layers is 
+ * determined by the depth extent.
+ * - A cubemap CUDA array is allocated if all three extents are non-zero and the
+ * cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six. A cubemap is
+ * a special type of 2D layered CUDA array, where the six layers represent the six faces of a cube. 
+ * The order of the six layers in memory is the same as that listed in ::cudaGraphicsCubeFace.
+ * - A cubemap layered CUDA array is allocated if all three extents are non-zero, and both,
+ * cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and depth must be 
+ * a multiple of six. A cubemap layered CUDA array is a special type of 2D layered CUDA array that consists 
+ * of a collection of cubemaps. The first six layers represent the first cubemap, the next six layers form 
+ * the second cubemap, and so on.
+ *
+ *
+ * The \p flags parameter enables different options to be specified that affect
+ * the allocation, as follows.
+ * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation
+ * - ::cudaArrayLayered: Allocates a layered CUDA array, with the depth extent indicating the number of layers
+ * - ::cudaArrayCubemap: Allocates a cubemap CUDA array. Width must be equal to height, and depth must be six.
+ *   If the cudaArrayLayered flag is also set, depth must be a multiple of six.
+ * - ::cudaArraySurfaceLoadStore: Allocates a CUDA array that could be read from or written to using a surface
+ *   reference.
+ * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the CUDA 
+ *   array. Texture gather can only be performed on 2D CUDA arrays.
+ * - ::cudaArraySparse: Allocates a CUDA array without physical backing memory. The subregions within this sparse array 
+ *   can later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. This flag can only be used for 
+ *   creating 2D, 3D or 2D layered sparse CUDA arrays. The physical backing memory must be allocated via ::cuMemCreate.
+ * - ::cudaArrayDeferredMapping: Allocates a CUDA array without physical backing memory. The entire array can
+ *   later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. The physical backing memory must be allocated
+ *   via ::cuMemCreate.
+ *
+ * The width, height and depth extents must meet certain size requirements as listed in the following table.
+ * All values are specified in elements.
+ *
+ * Note that 2D CUDA arrays have different size requirements if the ::cudaArrayTextureGather flag is set. In that
+ * case, the valid range for (width, height, depth) is ((1,maxTexture2DGather[0]), (1,maxTexture2DGather[1]), 0).
+ *
+ * \xmlonly
+ * <table outputclass="xmlonly">
+ * <tgroup cols="3" colsep="1" rowsep="1">
+ * <colspec colname="c1" colwidth="1.0*"/>
+ * <colspec colname="c2" colwidth="3.0*"/>
+ * <colspec colname="c3" colwidth="3.0*"/>
+ * <thead>
+ * <row>
+ * <entry>CUDA array type</entry>
+ * <entry>Valid extents that must always be met {(width range in elements),
+ * (height range), (depth range)}</entry>
+ * <entry>Valid extents with cudaArraySurfaceLoadStore set {(width range in
+ * elements), (height range), (depth range)}</entry>
+ * </row>
+ * </thead>
+ * <tbody>
+ * <row>
+ * <entry>1D</entry>
+ * <entry>{ (1,maxTexture1D), 0, 0 }</entry>
+ * <entry>{ (1,maxSurface1D), 0, 0 }</entry>
+ * </row>
+ * <row>
+ * <entry>2D</entry>
+ * <entry>{ (1,maxTexture2D[0]), (1,maxTexture2D[1]), 0 }</entry>
+ * <entry>{ (1,maxSurface2D[0]), (1,maxSurface2D[1]), 0 }</entry>
+ * </row>
+ * <row>
+ * <entry>3D</entry>
+ * <entry>{ (1,maxTexture3D[0]), (1,maxTexture3D[1]), (1,maxTexture3D[2]) }
+ * OR { (1,maxTexture3DAlt[0]), (1,maxTexture3DAlt[1]),
+ * (1,maxTexture3DAlt[2]) }</entry>
+ * <entry>{ (1,maxSurface3D[0]), (1,maxSurface3D[1]), (1,maxSurface3D[2]) }</entry>
+ * </row>
+ * <row>
+ * <entry>1D Layered</entry>
+ * <entry>{ (1,maxTexture1DLayered[0]), 0, (1,maxTexture1DLayered[1]) }</entry>
+ * <entry>{ (1,maxSurface1DLayered[0]), 0, (1,maxSurface1DLayered[1]) }</entry>
+ * </row>
+ * <row>
+ * <entry>2D Layered</entry>
+ * <entry>{ (1,maxTexture2DLayered[0]), (1,maxTexture2DLayered[1]),
+ * (1,maxTexture2DLayered[2]) }</entry>
+ * <entry>{ (1,maxSurface2DLayered[0]), (1,maxSurface2DLayered[1]),
+ * (1,maxSurface2DLayered[2]) }</entry>
+ * </row>
+ * <row>
+ * <entry>Cubemap</entry>
+ * <entry>{ (1,maxTextureCubemap), (1,maxTextureCubemap), 6 }</entry>
+ * <entry>{ (1,maxSurfaceCubemap), (1,maxSurfaceCubemap), 6 }</entry>
+ * </row>
+ * <row>
+ * <entry>Cubemap Layered</entry>
+ * <entry>{ (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[0]),
+ * (1,maxTextureCubemapLayered[1]) }</entry>
+ * <entry>{ (1,maxSurfaceCubemapLayered[0]), (1,maxSurfaceCubemapLayered[0]),
+ * (1,maxSurfaceCubemapLayered[1]) }</entry>
+ * </row>
+ * </tbody>
+ * </tgroup>
+ * </table>
+ * \endxmlonly
+ *
+ * \param array  - Pointer to allocated array in device memory
+ * \param desc   - Requested channel format
+ * \param extent - Requested allocation size (\p width field in elements)
+ * \param flags  - Flags for extensions
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree,
+ * ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::make_cudaExtent,
+ * ::cuArray3DCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags __dv(0));
+
+/**
+ * \brief Allocate a mipmapped array on the device
+ *
+ * Allocates a CUDA mipmapped array according to the ::cudaChannelFormatDesc structure
+ * \p desc and returns a handle to the new CUDA mipmapped array in \p *mipmappedArray.
+ * \p numLevels specifies the number of mipmap levels to be allocated. This value is
+ * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))].
+ *
+ * The ::cudaChannelFormatDesc is defined as:
+ * \code
+    struct cudaChannelFormatDesc {
+        int x, y, z, w;
+        enum cudaChannelFormatKind f;
+    };
+    \endcode
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
+ *
+ * ::cudaMallocMipmappedArray() can allocate the following:
+ *
+ * - A 1D mipmapped array is allocated if the height and depth extents are both zero.
+ * - A 2D mipmapped array is allocated if only the depth extent is zero.
+ * - A 3D mipmapped array is allocated if all three extents are non-zero.
+ * - A 1D layered CUDA mipmapped array is allocated if only the height extent is zero and
+ * the cudaArrayLayered flag is set. Each layer is a 1D mipmapped array. The number of layers is 
+ * determined by the depth extent.
+ * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and 
+ * the cudaArrayLayered flag is set. Each layer is a 2D mipmapped array. The number of layers is 
+ * determined by the depth extent.
+ * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the
+ * cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six.
+ * The order of the six layers in memory is the same as that listed in ::cudaGraphicsCubeFace.
+ * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, and both,
+ * cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and depth must be 
+ * a multiple of six. A cubemap layered CUDA mipmapped array is a special type of 2D layered CUDA mipmapped
+ * array that consists of a collection of cubemap mipmapped arrays. The first six layers represent the 
+ * first cubemap mipmapped array, the next six layers form the second cubemap mipmapped array, and so on.
+ *
+ *
+ * The \p flags parameter enables different options to be specified that affect
+ * the allocation, as follows.
+ * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default mipmapped array allocation
+ * - ::cudaArrayLayered: Allocates a layered CUDA mipmapped array, with the depth extent indicating the number of layers
+ * - ::cudaArrayCubemap: Allocates a cubemap CUDA mipmapped array. Width must be equal to height, and depth must be six.
+ *   If the cudaArrayLayered flag is also set, depth must be a multiple of six.
+ * - ::cudaArraySurfaceLoadStore: This flag indicates that individual mipmap levels of the CUDA mipmapped array 
+ *   will be read from or written to using a surface reference.
+ * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the CUDA 
+ *   array. Texture gather can only be performed on 2D CUDA mipmapped arrays, and the gather operations are
+ *   performed only on the most detailed mipmap level.
+ * - ::cudaArraySparse: Allocates a CUDA mipmapped array without physical backing memory. The subregions within this sparse array
+ *   can later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. This flag can only be used for creating 
+ *   2D, 3D or 2D layered sparse CUDA mipmapped arrays. The physical backing memory must be allocated via ::cuMemCreate.
+ * - ::cudaArrayDeferredMapping: Allocates a CUDA mipmapped array without physical backing memory. The entire array can
+ *   later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. The physical backing memory must be allocated
+ *   via ::cuMemCreate.
+ *
+ * The width, height and depth extents must meet certain size requirements as listed in the following table.
+ * All values are specified in elements.
+ *
+ * \xmlonly
+ * <table outputclass="xmlonly">
+ * <tgroup cols="3" colsep="1" rowsep="1">
+ * <colspec colname="c1" colwidth="1.0*"/>
+ * <colspec colname="c2" colwidth="3.0*"/>
+ * <colspec colname="c3" colwidth="3.0*"/>
+ * <thead>
+ * <row>
+ * <entry>CUDA array type</entry>
+ * <entry>Valid extents that must always be met {(width range in elements),
+ * (height range), (depth range)}</entry>
+ * <entry>Valid extents with cudaArraySurfaceLoadStore set {(width range in
+ * elements), (height range), (depth range)}</entry>
+ * </row>
+ * </thead>
+ * <tbody>
+ * <row>
+ * <entry>1D</entry>
+ * <entry>{ (1,maxTexture1DMipmap), 0, 0 }</entry>
+ * <entry>{ (1,maxSurface1D), 0, 0 }</entry>
+ * </row>
+ * <row>
+ * <entry>2D</entry>
+ * <entry>{ (1,maxTexture2DMipmap[0]), (1,maxTexture2DMipmap[1]), 0 }</entry>
+ * <entry>{ (1,maxSurface2D[0]), (1,maxSurface2D[1]), 0 }</entry>
+ * </row>
+ * <row>
+ * <entry>3D</entry>
+ * <entry>{ (1,maxTexture3D[0]), (1,maxTexture3D[1]), (1,maxTexture3D[2]) }
+ * OR { (1,maxTexture3DAlt[0]), (1,maxTexture3DAlt[1]),
+ * (1,maxTexture3DAlt[2]) }</entry>
+ * <entry>{ (1,maxSurface3D[0]), (1,maxSurface3D[1]), (1,maxSurface3D[2]) }</entry>
+ * </row>
+ * <row>
+ * <entry>1D Layered</entry>
+ * <entry>{ (1,maxTexture1DLayered[0]), 0, (1,maxTexture1DLayered[1]) }</entry>
+ * <entry>{ (1,maxSurface1DLayered[0]), 0, (1,maxSurface1DLayered[1]) }</entry>
+ * </row>
+ * <row>
+ * <entry>2D Layered</entry>
+ * <entry>{ (1,maxTexture2DLayered[0]), (1,maxTexture2DLayered[1]),
+ * (1,maxTexture2DLayered[2]) }</entry>
+ * <entry>{ (1,maxSurface2DLayered[0]), (1,maxSurface2DLayered[1]),
+ * (1,maxSurface2DLayered[2]) }</entry>
+ * </row>
+ * <row>
+ * <entry>Cubemap</entry>
+ * <entry>{ (1,maxTextureCubemap), (1,maxTextureCubemap), 6 }</entry>
+ * <entry>{ (1,maxSurfaceCubemap), (1,maxSurfaceCubemap), 6 }</entry>
+ * </row>
+ * <row>
+ * <entry>Cubemap Layered</entry>
+ * <entry>{ (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[0]),
+ * (1,maxTextureCubemapLayered[1]) }</entry>
+ * <entry>{ (1,maxSurfaceCubemapLayered[0]), (1,maxSurfaceCubemapLayered[0]),
+ * (1,maxSurfaceCubemapLayered[1]) }</entry>
+ * </row>
+ * </tbody>
+ * </tgroup>
+ * </table>
+ * \endxmlonly
+ *
+ * \param mipmappedArray  - Pointer to allocated mipmapped array in device memory
+ * \param desc            - Requested channel format
+ * \param extent          - Requested allocation size (\p width field in elements)
+ * \param numLevels       - Number of mipmap levels to allocate
+ * \param flags           - Flags for extensions
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree,
+ * ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::make_cudaExtent,
+ * ::cuMipmappedArrayCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(cudaMipmappedArray_t *mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags __dv(0));
+
+/**
+ * \brief Gets a mipmap level of a CUDA mipmapped array
+ *
+ * Returns in \p *levelArray a CUDA array that represents a single mipmap level
+ * of the CUDA mipmapped array \p mipmappedArray.
+ *
+ * If \p level is greater than the maximum number of levels in this mipmapped array,
+ * ::cudaErrorInvalidValue is returned.
+ *
+ * If \p mipmappedArray is NULL,
+ * ::cudaErrorInvalidResourceHandle is returned.
+ *
+ * \param levelArray     - Returned mipmap level CUDA array
+ * \param mipmappedArray - CUDA mipmapped array
+ * \param level          - Mipmap level
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree,
+ * ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::make_cudaExtent,
+ * ::cuMipmappedArrayGetLevel
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level);
+
+/**
+ * \brief Copies data between 3D objects
+ *
+\code
+struct cudaExtent {
+  size_t width;
+  size_t height;
+  size_t depth;
+};
+struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);
+
+struct cudaPos {
+  size_t x;
+  size_t y;
+  size_t z;
+};
+struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);
+
+struct cudaMemcpy3DParms {
+  cudaArray_t           srcArray;
+  struct cudaPos        srcPos;
+  struct cudaPitchedPtr srcPtr;
+  cudaArray_t           dstArray;
+  struct cudaPos        dstPos;
+  struct cudaPitchedPtr dstPtr;
+  struct cudaExtent     extent;
+  enum cudaMemcpyKind   kind;
+};
+\endcode
+ *
+ * ::cudaMemcpy3D() copies data between two 3D objects. The source and
+ * destination objects may be in either host memory, device memory, or a CUDA
+ * array. The source, destination, extent, and kind of copy performed is
+ * specified by the ::cudaMemcpy3DParms struct which should be initialized to
+ * zero before use:
+\code
+cudaMemcpy3DParms myParms = {0};
+\endcode
+ *
+ * The struct passed to ::cudaMemcpy3D() must specify one of \p srcArray or
+ * \p srcPtr and one of \p dstArray or \p dstPtr. Passing more than one
+ * non-zero source or destination will cause ::cudaMemcpy3D() to return an
+ * error.
+ *
+ * The \p srcPos and \p dstPos fields are optional offsets into the source and
+ * destination objects and are defined in units of each object's elements. The
+ * element for a host or device pointer is assumed to be <b>unsigned char</b>.
+ *
+ * The \p extent field defines the dimensions of the transferred area in
+ * elements. If a CUDA array is participating in the copy, the extent is
+ * defined in terms of that array's elements. If no CUDA array is
+ * participating in the copy then the extents are defined in elements of
+ * <b>unsigned char</b>.
+ *
+ * The \p kind field defines the direction of the copy. It must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * For ::cudaMemcpyHostToHost or ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost
+ * passed as kind and cudaArray type passed as source or destination, if the kind
+ * implies cudaArray type to be present on the host, ::cudaMemcpy3D() will
+ * disregard that implication and silently correct the kind based on the fact that
+ * cudaArray type can only be present on the device.
+ *
+ * If the source and destination are both arrays, ::cudaMemcpy3D() will return
+ * an error if they do not have the same element size.
+ *
+ * The source and destination object may not overlap. If overlapping source
+ * and destination objects are specified, undefined behavior will result.
+ *
+ * The source object must entirely contain the region defined by \p srcPos
+ * and \p extent. The destination object must entirely contain the region
+ * defined by \p dstPos and \p extent.
+ *
+ * ::cudaMemcpy3D() returns an error if the pitch of \p srcPtr or \p dstPtr
+ * exceeds the maximum allowed. The pitch of a ::cudaPitchedPtr allocated
+ * with ::cudaMalloc3D() will always be valid.
+ *
+ * \param p - 3D memory copy parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaMemset3D, ::cudaMemcpy3DAsync,
+ * ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::make_cudaExtent, ::make_cudaPos,
+ * ::cuMemcpy3D
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p);
+
+/**
+ * \brief Copies memory between devices
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p p.  See the definition of the ::cudaMemcpy3DPeerParms structure
+ * for documentation of its parameters.
+ *
+ * Note that this function is synchronous with respect to the host only if
+ * the source or destination of the transfer is host memory.  Note also 
+ * that this copy is serialized with respect to all pending and future 
+ * asynchronous work in the current device, the copy's source device,
+ * and the copy's destination device (use ::cudaMemcpy3DPeerAsync to avoid 
+ * this synchronization).
+ *
+ * \param p - Parameters for the memory copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidPitchValue
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync,
+ * ::cudaMemcpy3DPeerAsync,
+ * ::cuMemcpy3DPeer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p);
+
+/**
+ * \brief Copies data between 3D objects
+ *
+\code
+struct cudaExtent {
+  size_t width;
+  size_t height;
+  size_t depth;
+};
+struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);
+
+struct cudaPos {
+  size_t x;
+  size_t y;
+  size_t z;
+};
+struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);
+
+struct cudaMemcpy3DParms {
+  cudaArray_t           srcArray;
+  struct cudaPos        srcPos;
+  struct cudaPitchedPtr srcPtr;
+  cudaArray_t           dstArray;
+  struct cudaPos        dstPos;
+  struct cudaPitchedPtr dstPtr;
+  struct cudaExtent     extent;
+  enum cudaMemcpyKind   kind;
+};
+\endcode
+ *
+ * ::cudaMemcpy3DAsync() copies data between two 3D objects. The source and
+ * destination objects may be in either host memory, device memory, or a CUDA
+ * array. The source, destination, extent, and kind of copy performed is
+ * specified by the ::cudaMemcpy3DParms struct which should be initialized to
+ * zero before use:
+\code
+cudaMemcpy3DParms myParms = {0};
+\endcode
+ *
+ * The struct passed to ::cudaMemcpy3DAsync() must specify one of \p srcArray
+ * or \p srcPtr and one of \p dstArray or \p dstPtr. Passing more than one
+ * non-zero source or destination will cause ::cudaMemcpy3DAsync() to return an
+ * error.
+ *
+ * The \p srcPos and \p dstPos fields are optional offsets into the source and
+ * destination objects and are defined in units of each object's elements. The
+ * element for a host or device pointer is assumed to be <b>unsigned char</b>.
+ * For CUDA arrays, positions must be in the range [0, 2048) for any
+ * dimension.
+ *
+ * The \p extent field defines the dimensions of the transferred area in
+ * elements. If a CUDA array is participating in the copy, the extent is
+ * defined in terms of that array's elements. If no CUDA array is
+ * participating in the copy then the extents are defined in elements of
+ * <b>unsigned char</b>.
+ *
+ * The \p kind field defines the direction of the copy. It must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * For ::cudaMemcpyHostToHost or ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost
+ * passed as kind and cudaArray type passed as source or destination, if the kind
+ * implies cudaArray type to be present on the host, ::cudaMemcpy3DAsync() will
+ * disregard that implication and silently correct the kind based on the fact that
+ * cudaArray type can only be present on the device.
+ *
+ * If the source and destination are both arrays, ::cudaMemcpy3DAsync() will
+ * return an error if they do not have the same element size.
+ *
+ * The source and destination object may not overlap. If overlapping source
+ * and destination objects are specified, undefined behavior will result.
+ *
+ * The source object must entirely contain the region defined by \p srcPos
+ * and \p extent. The destination object must entirely contain the region
+ * defined by \p dstPos and \p extent.
+ *
+ * ::cudaMemcpy3DAsync() returns an error if the pitch of \p srcPtr or
+ * \p dstPtr exceeds the maximum allowed. The pitch of a
+ * ::cudaPitchedPtr allocated with ::cudaMalloc3D() will always be valid.
+ *
+ * ::cudaMemcpy3DAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If
+ * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream
+ * is non-zero, the copy may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device to device copies and
+ * cannot be given local or shared pointers.
+ *
+ * \param p      - 3D memory copy parameters
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaMemset3D, ::cudaMemcpy3D,
+ * ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::make_cudaExtent, ::make_cudaPos,
+ * ::cuMemcpy3DAsync
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies memory between devices asynchronously.
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p p.  See the definition of the ::cudaMemcpy3DPeerParms structure
+ * for documentation of its parameters.
+ *
+ * \param p      - Parameters for the memory copy
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidPitchValue
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync,
+ * ::cudaMemcpy3DPeerAsync,
+ * ::cuMemcpy3DPeerAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Gets free and total device memory
+ *
+ * Returns in \p *total the total amount of memory available to the current context.
+ * Returns in \p *free the amount of memory on the device that is free according to the OS.
+ * CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free.
+ * In a multi-tenant situation, the free estimate returned is prone to a race condition:
+ * a new allocation/free done by a different process, or by a different thread in the same
+ * process, between the time when free memory was estimated and when it was reported will
+ * cause the reported free value to deviate from the actual free memory.
+ *
+ * The integrated GPU on Tegra shares memory with CPU and other components
+ * of the SoC. The free and total values returned by the API exclude
+ * the SWAP memory space maintained by the OS on some platforms.
+ * The OS may move some of the memory pages into swap area as the GPU or
+ * CPU allocate or access memory. See Tegra app note on how to calculate
+ * total and free memory on Tegra.
+ *
+ * \param free  - Returned free memory in bytes
+ * \param total - Returned total memory in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuMemGetInfo
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free, size_t *total);
+
+/**
+ * \brief Gets info about the specified cudaArray
+ * 
+ * Returns in \p *desc, \p *extent and \p *flags respectively, the type, shape 
+ * and flags of \p array.
+ *
+ * Any of \p *desc, \p *extent and \p *flags may be specified as NULL.
+ *
+ * \param desc   - Returned array type
+ * \param extent - Returned array shape. 2D arrays will have depth of zero
+ * \param flags  - Returned array flags
+ * \param array  - The ::cudaArray to get info for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuArrayGetDescriptor,
+ * ::cuArray3DGetDescriptor
+ */
+extern __host__ cudaError_t CUDARTAPI cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, unsigned int *flags, cudaArray_t array);
+
+/**
+ * \brief Gets a CUDA array plane from a CUDA array
+ *
+ * Returns in \p pPlaneArray a CUDA array that represents a single format plane
+ * of the CUDA array \p hArray.
+ *
+ * If \p planeIdx is greater than the maximum number of planes in this array or if the array does
+ * not have a multi-planar format (e.g. ::cudaChannelFormatKindNV12), then ::cudaErrorInvalidValue is returned.
+ *
+ * Note that if the \p hArray has format ::cudaChannelFormatKindNV12, then passing in 0 for \p planeIdx returns
+ * a CUDA array of the same size as \p hArray but with one 8-bit channel and ::cudaChannelFormatKindUnsigned as its format kind.
+ * If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width
+ * of \p hArray with two 8-bit channels and ::cudaChannelFormatKindUnsigned as its format kind.
+ *
+ * \param pPlaneArray   - Returned CUDA array referenced by the \p planeIdx
+ * \param hArray        - CUDA array
+ * \param planeIdx      - Plane index
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ *
+ * \sa
+ * ::cuArrayGetPlane
+ */
+extern __host__ cudaError_t CUDARTAPI cudaArrayGetPlane(cudaArray_t *pPlaneArray, cudaArray_t hArray, unsigned int planeIdx);
+
+/**
+ * \brief Returns the memory requirements of a CUDA array
+ *
+ * Returns the memory requirements of a CUDA array in \p memoryRequirements.
+ * If the CUDA array is not allocated with flag ::cudaArrayDeferredMapping,
+ * ::cudaErrorInvalidValue will be returned.
+ *
+ * The returned value in ::cudaArrayMemoryRequirements::size
+ * represents the total size of the CUDA array.
+ * The returned value in ::cudaArrayMemoryRequirements::alignment
+ * represents the alignment necessary for mapping the CUDA array.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \param[out] memoryRequirements - Pointer to ::cudaArrayMemoryRequirements
+ * \param[in] array - CUDA array to get the memory requirements of
+ * \param[in] device - Device to get the memory requirements for
+ * \sa ::cudaMipmappedArrayGetMemoryRequirements
+ */
+extern __host__ cudaError_t CUDARTAPI cudaArrayGetMemoryRequirements(struct cudaArrayMemoryRequirements  *memoryRequirements, cudaArray_t array, int device);
+
+/**
+ * \brief Returns the memory requirements of a CUDA mipmapped array
+ *
+ * Returns the memory requirements of a CUDA mipmapped array in \p memoryRequirements.
+ * If the CUDA mipmapped array is not allocated with flag ::cudaArrayDeferredMapping,
+ * ::cudaErrorInvalidValue will be returned.
+ *
+ * The returned value in ::cudaArrayMemoryRequirements::size
+ * represents the total size of the CUDA mipmapped array.
+ * The returned value in ::cudaArrayMemoryRequirements::alignment
+ * represents the alignment necessary for mapping the CUDA mipmapped
+ * array.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \param[out] memoryRequirements - Pointer to ::cudaArrayMemoryRequirements
+ * \param[in] mipmap - CUDA mipmapped array to get the memory requirements of
+ * \param[in] device - Device to get the memory requirements for
+ * \sa ::cudaArrayGetMemoryRequirements
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMipmappedArrayGetMemoryRequirements(struct cudaArrayMemoryRequirements *memoryRequirements, cudaMipmappedArray_t mipmap, int device);
+
+/**
+ * \brief Returns the layout properties of a sparse CUDA array
+ *
+ * Returns the layout properties of a sparse CUDA array in \p sparseProperties.
+ * If the CUDA array is not allocated with flag ::cudaArraySparse
+ * ::cudaErrorInvalidValue will be returned.
+ *
+ * If the returned value in ::cudaArraySparseProperties::flags contains ::cudaArraySparsePropertiesSingleMipTail,
+ * then ::cudaArraySparseProperties::miptailSize represents the total size of the array. Otherwise, it will be zero.
+ * Also, the returned value in ::cudaArraySparseProperties::miptailFirstLevel is always zero.
+ * Note that the \p array must have been allocated using ::cudaMallocArray or ::cudaMalloc3DArray. For CUDA arrays obtained
+ * using ::cudaMipmappedArrayGetLevel, ::cudaErrorInvalidValue will be returned. Instead, ::cudaMipmappedArrayGetSparseProperties
+ * must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \param[out] sparseProperties - Pointer to return the ::cudaArraySparseProperties
+ * \param[in] array             - The CUDA array to get the sparse properties of 
+ *
+ * \sa
+ * ::cudaMipmappedArrayGetSparseProperties,
+ * ::cuMemMapArrayAsync
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaArrayGetSparseProperties(struct cudaArraySparseProperties *sparseProperties, cudaArray_t array);
+#endif
+
+/**
+ * \brief Returns the layout properties of a sparse CUDA mipmapped array
+ *
+ * Returns the sparse array layout properties in \p sparseProperties.
+ * If the CUDA mipmapped array is not allocated with flag ::cudaArraySparse
+ * ::cudaErrorInvalidValue will be returned.
+ *
+ * For non-layered CUDA mipmapped arrays, ::cudaArraySparseProperties::miptailSize returns the
+ * size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth
+ * is less than that of the tile.
+ * For layered CUDA mipmapped arrays, if ::cudaArraySparseProperties::flags contains ::cudaArraySparsePropertiesSingleMipTail,
+ * then ::cudaArraySparseProperties::miptailSize specifies the size of the mip tail of all layers combined.
+ * Otherwise, ::cudaArraySparseProperties::miptailSize specifies mip tail size per layer.
+ * The returned value of ::cudaArraySparseProperties::miptailFirstLevel is valid only if ::cudaArraySparseProperties::miptailSize is non-zero.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \param[out] sparseProperties - Pointer to return ::cudaArraySparseProperties
+ * \param[in] mipmap            - The CUDA mipmapped array to get the sparse properties of
+ *
+ * \sa
+ * ::cudaArrayGetSparseProperties,
+ * ::cuMemMapArrayAsync
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaMipmappedArrayGetSparseProperties(struct cudaArraySparseProperties *sparseProperties, cudaMipmappedArray_t mipmap);
+#endif
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src to the
+ * memory area pointed to by \p dst, where \p kind specifies the direction
+ * of the copy, and must be one of ::cudaMemcpyHostToHost,
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing. Calling
+ * ::cudaMemcpy() with dst and src pointers that do not match the direction of
+ * the copy results in undefined behavior.
+ *
+ * \param dst   - Destination memory address
+ * \param src   - Source memory address
+ * \param count - Size in bytes to copy
+ * \param kind  - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyDtoH,
+ * ::cuMemcpyHtoD,
+ * ::cuMemcpyDtoD,
+ * ::cuMemcpy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
+
+/**
+ * \brief Copies memory between two devices
+ *
+ * Copies memory from one device to memory on another device.  \p dst is the 
+ * base device pointer of the destination memory and \p dstDevice is the 
+ * destination device.  \p src is the base device pointer of the source memory 
+ * and \p srcDevice is the source device.  \p count specifies the number of bytes 
+ * to copy.
+ *
+ * Note that this function is asynchronous with respect to the host, but 
+ * serialized with respect to all pending and future asynchronous work in the
+ * current device, \p srcDevice, and \p dstDevice (use ::cudaMemcpyPeerAsync 
+ * to avoid this synchronization).
+ *
+ * \param dst       - Destination device pointer
+ * \param dstDevice - Destination device
+ * \param src       - Source device pointer
+ * \param srcDevice - Source device
+ * \param count     - Size of memory copy in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync,
+ * ::cudaMemcpy3DPeerAsync,
+ * ::cuMemcpyPeer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t count);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the memory
+ * area pointed to by \p src to the memory area pointed to by \p dst, where
+ * \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing. \p dpitch and
+ * \p spitch are the widths in memory in bytes of the 2D arrays pointed to by
+ * \p dst and \p src, including any padding added to the end of each row. The
+ * memory areas may not overlap. \p width must not exceed either \p dpitch or
+ * \p spitch. Calling ::cudaMemcpy2D() with \p dst and \p src pointers that do
+ * not match the direction of the copy results in undefined behavior.
+ * ::cudaMemcpy2D() returns an error if \p dpitch or \p spitch exceeds
+ * the maximum allowed.
+ *
+ * \param dst    - Destination memory address
+ * \param dpitch - Pitch of destination memory
+ * \param src    - Source memory address
+ * \param spitch - Pitch of source memory
+ * \param width  - Width of matrix transfer (columns in bytes)
+ * \param height - Height of matrix transfer (rows)
+ * \param kind   - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2D,
+ * ::cuMemcpy2DUnaligned
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the memory
+ * area pointed to by \p src to the CUDA array \p dst starting at
+ * \p hOffset rows and \p wOffset bytes from the upper left corner,
+ * where \p kind specifies the direction of the copy, and must be one
+ * of ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * \p spitch is the width in memory in bytes of the 2D array pointed to by
+ * \p src, including any padding added to the end of each row. \p wOffset +
+ * \p width must not exceed the width of the CUDA array \p dst. \p width must
+ * not exceed \p spitch. ::cudaMemcpy2DToArray() returns an error if \p spitch
+ * exceeds the maximum allowed.
+ *
+ * \param dst     - Destination memory address
+ * \param wOffset - Destination starting X offset (columns in bytes)
+ * \param hOffset - Destination starting Y offset (rows)
+ * \param src     - Source memory address
+ * \param spitch  - Pitch of source memory
+ * \param width   - Width of matrix transfer (columns in bytes)
+ * \param height  - Height of matrix transfer (rows)
+ * \param kind    - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2D,
+ * ::cuMemcpy2DUnaligned
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the CUDA
+ * array \p src starting at \p hOffset rows and \p wOffset bytes from the
+ * upper left corner to the memory area pointed to by \p dst, where
+ * \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing. \p dpitch is the
+ * width in memory in bytes of the 2D array pointed to by \p dst, including any
+ * padding added to the end of each row. \p wOffset + \p width must not exceed
+ * the width of the CUDA array \p src. \p width must not exceed \p dpitch.
+ * ::cudaMemcpy2DFromArray() returns an error if \p dpitch exceeds the maximum
+ * allowed.
+ *
+ * \param dst     - Destination memory address
+ * \param dpitch  - Pitch of destination memory
+ * \param src     - Source memory address
+ * \param wOffset - Source starting X offset (columns in bytes)
+ * \param hOffset - Source starting Y offset (rows)
+ * \param width   - Width of matrix transfer (columns in bytes)
+ * \param height  - Height of matrix transfer (rows)
+ * \param kind    - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2D,
+ * ::cuMemcpy2DUnaligned
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the CUDA
+ * array \p src starting at \p hOffsetSrc rows and \p wOffsetSrc bytes from the
+ * upper left corner to the CUDA array \p dst starting at \p hOffsetDst rows
+ * and \p wOffsetDst bytes from the upper left corner, where \p kind
+ * specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * \p wOffsetDst + \p width must not exceed the width of the CUDA array \p dst.
+ * \p wOffsetSrc + \p width must not exceed the width of the CUDA array \p src.
+ *
+ * \param dst        - Destination memory address
+ * \param wOffsetDst - Destination starting X offset (columns in bytes)
+ * \param hOffsetDst - Destination starting Y offset (rows)
+ * \param src        - Source memory address
+ * \param wOffsetSrc - Source starting X offset (columns in bytes)
+ * \param hOffsetSrc - Source starting Y offset (rows)
+ * \param width      - Width of matrix transfer (columns in bytes)
+ * \param height     - Height of matrix transfer (rows)
+ * \param kind       - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2D,
+ * ::cuMemcpy2DUnaligned
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
+
+/**
+ * \brief Copies data to the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src
+ * to the memory area located \p offset bytes from the start of symbol
+ * \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * \param symbol - Device symbol address
+ * \param src    - Source memory address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_sync
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray,  ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy,
+ * ::cuMemcpyHtoD,
+ * ::cuMemcpyDtoD
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice));
+
+
+/**
+ * \brief Copies data from the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area located \p offset bytes
+ * from the start of symbol \p symbol to the memory area pointed to by \p dst.
+ * The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * \param dst    - Destination memory address
+ * \param symbol - Device symbol address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_sync
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy,
+ * ::cuMemcpyDtoH,
+ * ::cuMemcpyDtoD
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost));
+
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src to the
+ * memory area pointed to by \p dst, where \p kind specifies the
+ * direction of the copy, and must be one of ::cudaMemcpyHostToHost,
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * 
+ * The memory areas may not overlap. Calling ::cudaMemcpyAsync() with \p dst and
+ * \p src pointers that do not match the direction of the copy results in
+ * undefined behavior.
+ *
+ * ::cudaMemcpyAsync() is asynchronous with respect to the host, so the call
+ * may return before the copy is complete. The copy can optionally be
+ * associated to a stream by passing a non-zero \p stream argument. If \p kind
+ * is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and the \p stream is
+ * non-zero, the copy may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device to device copies and
+ * cannot be given local or shared pointers.
+ *
+ * \param dst    - Destination memory address
+ * \param src    - Source memory address
+ * \param count  - Size in bytes to copy
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyAsync,
+ * ::cuMemcpyDtoHAsync,
+ * ::cuMemcpyHtoDAsync,
+ * ::cuMemcpyDtoDAsync
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies memory between two devices asynchronously.
+ *
+ * Copies memory from one device to memory on another device.  \p dst is the 
+ * base device pointer of the destination memory and \p dstDevice is the 
+ * destination device.  \p src is the base device pointer of the source memory 
+ * and \p srcDevice is the source device.  \p count specifies the number of bytes 
+ * to copy.
+ *
+ * Note that this function is asynchronous with respect to the host and all work
+ * on other devices.
+ *
+ * \param dst       - Destination device pointer
+ * \param dstDevice - Destination device
+ * \param src       - Source device pointer
+ * \param srcDevice - Source device
+ * \param count     - Size of memory copy in bytes
+ * \param stream    - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync,
+ * ::cudaMemcpy3DPeerAsync,
+ * ::cuMemcpyPeerAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the memory
+ * area pointed to by \p src to the memory area pointed to by \p dst, where
+ * \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * \p dpitch and \p spitch are the widths in memory in bytes of the 2D arrays
+ * pointed to by \p dst and \p src, including any padding added to the end of
+ * each row. The memory areas may not overlap. \p width must not exceed either
+ * \p dpitch or \p spitch.
+ *
+ * Calling ::cudaMemcpy2DAsync() with \p dst and \p src pointers that do not
+ * match the direction of the copy results in undefined behavior.
+ * ::cudaMemcpy2DAsync() returns an error if \p dpitch or \p spitch is greater
+ * than the maximum allowed.
+ *
+ * ::cudaMemcpy2DAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If
+ * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and
+ * \p stream is non-zero, the copy may overlap with operations in other
+ * streams.
+ *
+ * The device version of this function only handles device to device copies and
+ * cannot be given local or shared pointers.
+ *
+ * \param dst    - Destination memory address
+ * \param dpitch - Pitch of destination memory
+ * \param src    - Source memory address
+ * \param spitch - Pitch of source memory
+ * \param width  - Width of matrix transfer (columns in bytes)
+ * \param height - Height of matrix transfer (rows)
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2DAsync
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the memory
+ * area pointed to by \p src to the CUDA array \p dst starting at \p hOffset
+ * rows and \p wOffset bytes from the upper left corner, where \p kind specifies
+ * the direction of the copy, and must be one of ::cudaMemcpyHostToHost,
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * \p spitch is the width in memory in bytes of the 2D array pointed to by
+ * \p src, including any padding added to the end of each row. \p wOffset +
+ * \p width must not exceed the width of the CUDA array \p dst. \p width must
+ * not exceed \p spitch. ::cudaMemcpy2DToArrayAsync() returns an error if
+ * \p spitch exceeds the maximum allowed.
+ *
+ * ::cudaMemcpy2DToArrayAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If
+ * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and
+ * \p stream is non-zero, the copy may overlap with operations in other
+ * streams.
+ *
+ * \param dst     - Destination memory address
+ * \param wOffset - Destination starting X offset (columns in bytes)
+ * \param hOffset - Destination starting Y offset (rows)
+ * \param src     - Source memory address
+ * \param spitch  - Pitch of source memory
+ * \param width   - Width of matrix transfer (columns in bytes)
+ * \param height  - Height of matrix transfer (rows)
+ * \param kind    - Type of transfer
+ * \param stream  - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync,
+ * ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2DAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the CUDA
+ * array \p src starting at \p hOffset rows and \p wOffset bytes from the
+ * upper left corner to the memory area pointed to by \p dst,
+ * where \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * \p dpitch is the width in memory in bytes of the 2D
+ * array pointed to by \p dst, including any padding added to the end of each
+ * row. \p wOffset + \p width must not exceed the width of the CUDA array
+ * \p src. \p width must not exceed \p dpitch. ::cudaMemcpy2DFromArrayAsync()
+ * returns an error if \p dpitch exceeds the maximum allowed.
+ *
+ * ::cudaMemcpy2DFromArrayAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally be
+ * associated to a stream by passing a non-zero \p stream argument. If \p kind
+ * is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream is
+ * non-zero, the copy may overlap with operations in other streams.
+ *
+ * \param dst     - Destination memory address
+ * \param dpitch  - Pitch of destination memory
+ * \param src     - Source memory address
+ * \param wOffset - Source starting X offset (columns in bytes)
+ * \param hOffset - Source starting Y offset (rows)
+ * \param width   - Width of matrix transfer (columns in bytes)
+ * \param height  - Height of matrix transfer (rows)
+ * \param kind    - Type of transfer
+ * \param stream  - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync,
+ * ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2DAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data to the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src
+ * to the memory area located \p offset bytes from the start of symbol
+ * \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If
+ * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy
+ * may overlap with operations in other streams.
+ *
+ * \param symbol - Device symbol address
+ * \param src    - Source memory address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyAsync,
+ * ::cuMemcpyHtoDAsync,
+ * ::cuMemcpyDtoDAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data from the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area located \p offset bytes
+ * from the start of symbol \p symbol to the memory area pointed to by \p dst.
+ * The memory areas may not overlap. \p symbol is a variable that resides in
+ * global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally be
+ * associated to a stream by passing a non-zero \p stream argument. If \p kind
+ * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap
+ * with operations in other streams.
+ *
+ * \param dst    - Destination memory address
+ * \param symbol - Device symbol address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync,
+ * ::cuMemcpyAsync,
+ * ::cuMemcpyDtoHAsync,
+ * ::cuMemcpyDtoDAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Fills the first \p count bytes of the memory area pointed to by \p devPtr
+ * with the constant byte value \p value.
+ *
+ * Note that this function is asynchronous with respect to the host unless
+ * \p devPtr refers to pinned host memory.
+ *
+ * \param devPtr - Pointer to device memory
+ * \param value  - Value to set for each byte of specified memory
+ * \param count  - Size in bytes to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_memset
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuMemsetD8,
+ * ::cuMemsetD16,
+ * ::cuMemsetD32
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count);
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Sets to the specified value \p value a matrix (\p height rows of \p width
+ * bytes each) pointed to by \p devPtr. \p pitch is the width in bytes of the
+ * 2D array pointed to by \p devPtr, including any padding added to the end
+ * of each row. This function performs fastest when the pitch is one that has
+ * been passed back by ::cudaMallocPitch().
+ *
+ * Note that this function is asynchronous with respect to the host unless
+ * \p devPtr refers to pinned host memory.
+ *
+ * \param devPtr - Pointer to 2D device memory
+ * \param pitch  - Pitch in bytes of 2D device memory (Unused if \p height is 1)
+ * \param value  - Value to set for each byte of specified memory
+ * \param width  - Width of matrix set (columns in bytes)
+ * \param height - Height of matrix set (rows)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_memset
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemset, ::cudaMemset3D, ::cudaMemsetAsync,
+ * ::cudaMemset2DAsync, ::cudaMemset3DAsync,
+ * ::cuMemsetD2D8,
+ * ::cuMemsetD2D16,
+ * ::cuMemsetD2D32
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height);
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Initializes each element of a 3D array to the specified value \p value.
+ * The object to initialize is defined by \p pitchedDevPtr. The \p pitch field
+ * of \p pitchedDevPtr is the width in memory in bytes of the 3D array pointed
+ * to by \p pitchedDevPtr, including any padding added to the end of each row.
+ * The \p xsize field specifies the logical width of each row in bytes, while
+ * the \p ysize field specifies the height of each 2D slice in rows.
+ * The \p pitch field of \p pitchedDevPtr is ignored when \p height and \p depth 
+ * are both equal to 1. 
+ *
+ * The extents of the initialized region are specified as a \p width in bytes,
+ * a \p height in rows, and a \p depth in slices.
+ *
+ * Extents with \p width greater than or equal to the \p xsize of
+ * \p pitchedDevPtr may perform significantly faster than extents narrower
+ * than the \p xsize. Secondarily, extents with \p height equal to the
+ * \p ysize of \p pitchedDevPtr will perform faster than when the \p height is
+ * shorter than the \p ysize.
+ *
+ * This function performs fastest when the \p pitchedDevPtr has been allocated
+ * by ::cudaMalloc3D().
+ *
+ * Note that this function is asynchronous with respect to the host unless
+ * \p pitchedDevPtr refers to pinned host memory.
+ *
+ * \param pitchedDevPtr - Pointer to pitched device memory
+ * \param value         - Value to set for each byte of specified memory
+ * \param extent        - Size parameters for where to set device memory (\p width field in bytes)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_memset
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemset, ::cudaMemset2D,
+ * ::cudaMemsetAsync, ::cudaMemset2DAsync, ::cudaMemset3DAsync,
+ * ::cudaMalloc3D, ::make_cudaPitchedPtr,
+ * ::make_cudaExtent
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent);
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Fills the first \p count bytes of the memory area pointed to by \p devPtr
+ * with the constant byte value \p value.
+ *
+ * ::cudaMemsetAsync() is asynchronous with respect to the host, so
+ * the call may return before the memset is complete. The operation can optionally
+ * be associated to a stream by passing a non-zero \p stream argument.
+ * If \p stream is non-zero, the operation may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device to device copies and
+ * cannot be given local or shared pointers.
+ *
+ * \param devPtr - Pointer to device memory
+ * \param value  - Value to set for each byte of specified memory
+ * \param count  - Size in bytes to set
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D,
+ * ::cudaMemset2DAsync, ::cudaMemset3DAsync,
+ * ::cuMemsetD8Async,
+ * ::cuMemsetD16Async,
+ * ::cuMemsetD32Async
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Sets to the specified value \p value a matrix (\p height rows of \p width
+ * bytes each) pointed to by \p devPtr. \p pitch is the width in bytes of the
+ * 2D array pointed to by \p devPtr, including any padding added to the end
+ * of each row. This function performs fastest when the pitch is one that has
+ * been passed back by ::cudaMallocPitch().
+ *
+ * ::cudaMemset2DAsync() is asynchronous with respect to the host, so
+ * the call may return before the memset is complete. The operation can optionally
+ * be associated to a stream by passing a non-zero \p stream argument.
+ * If \p stream is non-zero, the operation may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device to device copies and
+ * cannot be given local or shared pointers.
+ *
+ * \param devPtr - Pointer to 2D device memory
+ * \param pitch  - Pitch in bytes of 2D device memory (Unused if \p height is 1)
+ * \param value  - Value to set for each byte of specified memory
+ * \param width  - Width of matrix set (columns in bytes)
+ * \param height - Height of matrix set (rows)
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D,
+ * ::cudaMemsetAsync, ::cudaMemset3DAsync,
+ * ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16Async,
+ * ::cuMemsetD2D32Async
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Initializes each element of a 3D array to the specified value \p value.
+ * The object to initialize is defined by \p pitchedDevPtr. The \p pitch field
+ * of \p pitchedDevPtr is the width in memory in bytes of the 3D array pointed
+ * to by \p pitchedDevPtr, including any padding added to the end of each row.
+ * The \p xsize field specifies the logical width of each row in bytes, while
+ * the \p ysize field specifies the height of each 2D slice in rows.
+ * The \p pitch field of \p pitchedDevPtr is ignored when \p height and \p depth 
+ * are both equal to 1. 
+ *
+ * The extents of the initialized region are specified as a \p width in bytes,
+ * a \p height in rows, and a \p depth in slices.
+ *
+ * Extents with \p width greater than or equal to the \p xsize of
+ * \p pitchedDevPtr may perform significantly faster than extents narrower
+ * than the \p xsize. Secondarily, extents with \p height equal to the
+ * \p ysize of \p pitchedDevPtr will perform faster than when the \p height is
+ * shorter than the \p ysize.
+ *
+ * This function performs fastest when the \p pitchedDevPtr has been allocated
+ * by ::cudaMalloc3D().
+ *
+ * ::cudaMemset3DAsync() is asynchronous with respect to the host, so
+ * the call may return before the memset is complete. The operation can optionally
+ * be associated to a stream by passing a non-zero \p stream argument.
+ * If \p stream is non-zero, the operation may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device to device copies and
+ * cannot be given local or shared pointers.
+ *
+ * \param pitchedDevPtr - Pointer to pitched device memory
+ * \param value         - Value to set for each byte of specified memory
+ * \param extent        - Size parameters for where to set device memory (\p width field in bytes)
+ * \param stream        - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D,
+ * ::cudaMemsetAsync, ::cudaMemset2DAsync,
+ * ::cudaMalloc3D, ::make_cudaPitchedPtr,
+ * ::make_cudaExtent
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Finds the address associated with a CUDA symbol
+ *
+ * Returns in \p *devPtr the address of symbol \p symbol on the device.
+ * \p symbol is a variable that resides in global or constant memory space.
+ * If \p symbol cannot be found, or if \p symbol is not declared in the
+ * global or constant memory space, \p *devPtr is unchanged and the error
+ * ::cudaErrorInvalidSymbol is returned.
+ *
+ * \param devPtr - Return device pointer associated with symbol
+ * \param symbol - Device symbol address
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)",
+ * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)",
+ * ::cuModuleGetGlobal
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr, const void *symbol);
+
+/**
+ * \brief Finds the size of the object associated with a CUDA symbol
+ *
+ * Returns in \p *size the size of symbol \p symbol. \p symbol is a variable that
+ * resides in global or constant memory space. If \p symbol cannot be found, or
+ * if \p symbol is not declared in global or constant memory space, \p *size is
+ * unchanged and the error ::cudaErrorInvalidSymbol is returned.
+ *
+ * \param size   - Size of object associated with symbol
+ * \param symbol - Device symbol address
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)",
+ * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)",
+ * ::cuModuleGetGlobal
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size, const void *symbol);
+
+/**
+ * \brief Prefetches memory to the specified destination device
+ *
+ * Prefetches memory to the specified destination device.  \p devPtr is the 
+ * base device pointer of the memory to be prefetched and \p dstDevice is the 
+ * destination device. \p count specifies the number of bytes to copy. \p stream
+ * is the stream in which the operation is enqueued. The memory range must refer
+ * to managed memory allocated via ::cudaMallocManaged or declared via __managed__ variables.
+ *
+ * Passing in cudaCpuDeviceId for \p dstDevice will prefetch the data to host memory. If
+ * \p dstDevice is a GPU, then the device attribute ::cudaDevAttrConcurrentManagedAccess
+ * must be non-zero. Additionally, \p stream must be associated with a device that has a
+ * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * The start address and end address of the memory range will be rounded down and rounded up
+ * respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ * in the stream.
+ *
+ * If no physical memory has been allocated for this region, then this memory region
+ * will be populated and mapped on the destination device. If there's insufficient
+ * memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ * ::cudaMallocManaged allocations to host memory in order to make room. Device memory
+ * allocated using ::cudaMalloc or ::cudaMallocArray will not be evicted.
+ *
+ * By default, any mappings to the previous location of the migrated pages are removed and
+ * mappings for the new location are only setup on \p dstDevice. The exact behavior however
+ * also depends on the settings applied to this memory range via ::cudaMemAdvise as described
+ * below:
+ *
+ * If ::cudaMemAdviseSetReadMostly was set on any subset of this memory range,
+ * then that subset will create a read-only copy of the pages on \p dstDevice.
+ *
+ * If ::cudaMemAdviseSetPreferredLocation was called on any subset of this memory
+ * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the
+ * preferred location of any pages in the memory range.
+ *
+ * If ::cudaMemAdviseSetAccessedBy was called on any subset of this memory range,
+ * then mappings to those pages from all the appropriate processors are updated to
+ * refer to the new location if establishing such a mapping is possible. Otherwise,
+ * those mappings are cleared.
+ *
+ * Note that this API is not required for functionality and only serves to improve performance
+ * by allowing the application to migrate data to a suitable location before it is accessed.
+ * Memory accesses to this range are always coherent and are allowed even when the data is
+ * actively being migrated.
+ *
+ * Note that this function is asynchronous with respect to the host and all work
+ * on other devices.
+ *
+ * \param devPtr    - Pointer to be prefetched
+ * \param count     - Size in bytes
+ * \param dstDevice - Destination device to prefetch to
+ * \param stream    - Stream to enqueue prefetch operation
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync,
+ * ::cudaMemcpy3DPeerAsync, ::cudaMemAdvise, ::cudaMemAdvise_v2,
+ * ::cuMemPrefetchAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream __dv(0));
+
+extern __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync_v2(const void *devPtr, size_t count, struct cudaMemLocation location, unsigned int flags, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Advise about the usage of a given memory range
+ *
+ * Advise the Unified Memory subsystem about the usage pattern for the memory range
+ * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ * range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ * advice is applied. The memory range must refer to managed memory allocated via ::cudaMallocManaged
+ * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ * memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ * memory range results in an error being returned.
+ *
+ * The \p advice parameter can take the following values:
+ * - ::cudaMemAdviseSetReadMostly: This implies that the data is mostly going to be read
+ * from and only occasionally written to. Any read accesses from any processor to this region will create a
+ * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cudaMemPrefetchAsync
+ * is called on this region, it will create a read-only copy of the data on the destination processor.
+ * If any processor writes to this region, all copies of the corresponding page will be invalidated
+ * except for the one where the write occurred. The \p device argument is ignored for this advice.
+ * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ * that has a non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * Also, if a context is created on a device that does not have the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess set, then read-duplication will not occur until
+ * all such contexts are destroyed.
+ * If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ * have a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccess for a read-only
+ * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ * device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables, then setting this advice
+ * will not create a read-only copy when that device accesses this memory region.
+ *
+ * - ::cudaMemAdviseUnsetReadMostly: Undoes the effect of ::cudaMemAdviseSetReadMostly and also prevents the
+ * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ * copies of the data will be collapsed into a single copy. The location for the collapsed
+ * copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ * copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ *
+ * - ::cudaMemAdviseSetPreferredLocation: This advice sets the preferred location for the
+ * data to be the memory belonging to \p device. Passing in cudaCpuDeviceId for \p device sets the
+ * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess. Setting the preferred location
+ * does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ * when a fault occurs on that memory region. If the data is already in its preferred location and the
+ * faulting processor can establish a mapping without requiring the data to be migrated, then
+ * data migration will be avoided. On the other hand, if the data is not in its preferred location
+ * or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ * it. It is important to note that setting the preferred location does not prevent data prefetching
+ * done using ::cudaMemPrefetchAsync.
+ * Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ * if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice, unless read accesses from
+ * \p device will not result in a read-only copy being created on that device as outlined in description for
+ * the advice ::cudaMemAdviseSetReadMostly.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::cudaDevAttrPageableMemoryAccess.
+ *
+ * - ::cudaMemAdviseUnsetPreferredLocation: Undoes the effect of ::cudaMemAdviseSetPreferredLocation
+ * and changes the preferred location to none.
+ *
+ * - ::cudaMemAdviseSetAccessedBy: This advice implies that the data will be accessed by \p device.
+ * Passing in ::cudaCpuDeviceId for \p device will set the advice for the CPU. If \p device is a GPU, then
+ * the device attribute ::cudaDevAttrConcurrentManagedAccess must be non-zero.
+ * This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ * it causes the data to always be mapped in the specified processor's page tables, as long as the
+ * location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ * the mappings are updated accordingly.
+ * This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ * over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ * migration may be too high. But preventing faults can still help improve performance, and so having
+ * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ * ::cudaMemAdviseSetAccessedBy flag set for this data will now have its mapping updated to point to the
+ * page in host memory.
+ * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice. Additionally, if the
+ * preferred location of this memory region or any subset of it is also \p device, then the policies
+ * associated with ::cudaMemAdviseSetPreferredLocation will override the policies of this advice.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::cudaDevAttrPageableMemoryAccess. Additionally, if \p device has
+ * a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables,
+ * then this call has no effect.
+ *
+ * - ::cudaMemAdviseUnsetAccessedBy: Undoes the effect of ::cudaMemAdviseSetAccessedBy. Any mappings to
+ * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::cudaDevAttrPageableMemoryAccess. Additionally, if \p device has
+ * a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables,
+ * then this call has no effect.
+ *
+ * \param devPtr - Pointer to memory to set the advice for
+ * \param count  - Size in bytes of the memory range
+ * \param advice - Advice to be applied for the specified memory range
+ * \param device - Device to apply the advice for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync,
+ * ::cudaMemcpy3DPeerAsync, ::cudaMemPrefetchAsync,
+ * ::cuMemAdvise
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, int device);
+
+/**
+ * \brief Advise about the usage of a given memory range
+ *
+ * Advise the Unified Memory subsystem about the usage pattern for the memory range
+ * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ * range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ * advice is applied. The memory range must refer to managed memory allocated via ::cudaMallocManaged
+ * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ * memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ * memory range results in an error being returned.
+ *
+ * The \p advice parameter can take the following values:
+ * - ::cudaMemAdviseSetReadMostly: This implies that the data is mostly going to be read
+ * from and only occasionally written to. Any read accesses from any processor to this region will create a
+ * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cudaMemPrefetchAsync
+ * or ::cudaMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ * If the target location for ::cudaMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ * another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ * If any processor writes to this region, all copies of the corresponding page will be invalidated
+ * except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ * the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ * that has a non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * Also, if a context is created on a device that does not have the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess set, then read-duplication will not occur until
+ * all such contexts are destroyed.
+ * If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ * have a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccess for a read-only
+ * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ * device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables, then setting this advice
+ * will not create a read-only copy when that device accesses this memory region.
+ *
+ * - ::cudaMemAdviseUnsetReadMostly: Undoes the effect of ::cudaMemAdviseSetReadMostly and also prevents the
+ * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ * copies of the data will be collapsed into a single copy. The location for the collapsed
+ * copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ * copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ * Note: The \p location argument is ignored for this advice.
+ *
+ * - ::cudaMemAdviseSetPreferredLocation: This advice sets the preferred location for the
+ * data to be the memory belonging to \p location. When ::cudaMemLocation::type is ::cudaMemLocationTypeHost,
+ * ::cudaMemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ * to a specific host NUMA node, applications must set ::cudaMemLocation::type to ::cudaMemLocationTypeHostNuma and
+ * ::cudaMemLocation::id must specify the NUMA ID of the host NUMA node. If ::cudaMemLocation::type is set to ::cudaMemLocationTypeHostNumaCurrent,
+ * ::cudaMemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ * If ::cudaMemLocation::type is a ::cudaMemLocationTypeDevice, then ::cudaMemLocation::id must be a valid device ordinal
+ * and the device must have a non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ * when a fault occurs on that memory region. If the data is already in its preferred location and the
+ * faulting processor can establish a mapping without requiring the data to be migrated, then
+ * data migration will be avoided. On the other hand, if the data is not in its preferred location
+ * or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ * it. It is important to note that setting the preferred location does not prevent data prefetching
+ * done using ::cudaMemPrefetchAsync.
+ * Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ * if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice, unless read accesses from
+ * \p location will not result in a read-only copy being created on that processor as outlined in the description for
+ * the advice ::cudaMemAdviseSetReadMostly.
+ * If the memory region refers to valid system-allocated pageable memory, and ::cudaMemLocation::type is ::cudaMemLocationTypeDevice
+ * then ::cudaMemLocation::id must be a valid device that has a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccess.
+ *
+ * - ::cudaMemAdviseUnsetPreferredLocation: Undoes the effect of ::cudaMemAdviseSetPreferredLocation
+ * and changes the preferred location to none. The \p location argument is ignored for this advice.
+ *
+ * - ::cudaMemAdviseSetAccessedBy: This advice implies that the data will be accessed by processor \p location.
+ * The ::cudaMemLocation::type must be either ::cudaMemLocationTypeDevice with ::cudaMemLocation::id representing a valid device
+ * ordinal or ::cudaMemLocationTypeHost and ::cudaMemLocation::id will be ignored. All other location types are invalid.
+ * If ::cudaMemLocation::id is a GPU, then the device attribute ::cudaDevAttrConcurrentManagedAccess must be non-zero.
+ * This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ * it causes the data to always be mapped in the specified processor's page tables, as long as the
+ * location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ * the mappings are updated accordingly.
+ * This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ * over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ * migration may be too high. But preventing faults can still help improve performance, and so having
+ * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ * ::cudaMemAdviseSetAccessedBy flag set for this data will now have its mapping updated to point to the
+ * page in host memory.
+ * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice. Additionally, if the
+ * preferred location of this memory region or any subset of it is also \p location, then the policies
+ * associated with ::cudaMemAdviseSetPreferredLocation will override the policies of this advice.
+ * If the memory region refers to valid system-allocated pageable memory, and ::cudaMemLocation::type is ::cudaMemLocationTypeDevice
+ * then device in ::cudaMemLocation::id must have a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccess.
+ * Additionally, if ::cudaMemLocation::id has a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables,
+ * then this call has no effect.
+ *
+ * - ::cudaMemAdviseUnsetAccessedBy: Undoes the effect of ::cudaMemAdviseSetAccessedBy. Any mappings to
+ * the data from \p location may be removed at any time causing accesses to result in non-fatal page faults.
+ * If the memory region refers to valid system-allocated pageable memory, and ::cudaMemLocation::type is ::cudaMemLocationTypeDevice
+ * then device in ::cudaMemLocation::id must have a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccess.
+ * Additionally, if ::cudaMemLocation::id has a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables,
+ * then this call has no effect.
+ *
+ * \param devPtr   - Pointer to memory to set the advice for
+ * \param count    - Size in bytes of the memory range
+ * \param advice   - Advice to be applied for the specified memory range
+ * \param location - location to apply the advice for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync,
+ * ::cudaMemcpy3DPeerAsync, ::cudaMemPrefetchAsync,
+ * ::cuMemAdvise, ::cuMemAdvise_v2
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemAdvise_v2(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, struct cudaMemLocation location);
+
+/**
+* \brief Query an attribute of a given memory range
+*
+* Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The
+* memory range must refer to managed memory allocated via ::cudaMallocManaged or declared via
+* __managed__ variables.
+*
+* The \p attribute parameter can take the following values:
+* - ::cudaMemRangeAttributeReadMostly: If this attribute is specified, \p data will be interpreted
+* as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given
+* memory range have read-duplication enabled, or 0 otherwise.
+* - ::cudaMemRangeAttributePreferredLocation: If this attribute is specified, \p data will be
+* interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device
+* id if all pages in the memory range have that GPU as their preferred location, or it will be cudaCpuDeviceId
+* if all pages in the memory range have the CPU as their preferred location, or it will be cudaInvalidDeviceId
+* if either all the pages don't have the same preferred location or some of the pages don't have a
+* preferred location at all. Note that the actual location of the pages in the memory range at the time of
+* the query may be different from the preferred location.
+* - ::cudaMemRangeAttributeAccessedBy: If this attribute is specified, \p data will be interpreted
+* as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned
+* will be a list of device ids that had ::cudaMemAdviseSetAccessedBy set for that entire memory range.
+* If any device does not have that advice set for the entire memory range, that device will not be included.
+* If \p data is larger than the number of devices that have that advice set for that memory range,
+* cudaInvalidDeviceId will be returned in all the extra space provided. For ex., if \p dataSize is 12
+* (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be
+* { 0, cudaInvalidDeviceId, cudaInvalidDeviceId }. If \p data is smaller than the number of devices that have
+* that advice set, then only as many devices will be returned as can fit in the array. There is no
+* guarantee on which specific devices will be returned, however.
+* - ::cudaMemRangeAttributeLastPrefetchLocation: If this attribute is specified, \p data will be
+* interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location
+* to which all pages in the memory range were prefetched explicitly via ::cudaMemPrefetchAsync. This will either be
+* a GPU id or cudaCpuDeviceId depending on whether the last location for prefetch was a GPU or the CPU
+* respectively. If any page in the memory range was never explicitly prefetched or if all pages were not
+* prefetched to the same location, cudaInvalidDeviceId will be returned. Note that this simply returns the
+* last location that the application requested to prefetch the memory range to. It gives no indication as to
+* whether the prefetch operation to that location has completed or even begun.
+ * - ::cudaMemRangeAttributePreferredLocationType: If this attribute is specified, \p data will be
+ * interpreted as a ::cudaMemLocationType, and \p dataSize must be sizeof(cudaMemLocationType). The ::cudaMemLocationType returned will be
+ * ::cudaMemLocationTypeDevice if all pages in the memory range have the same GPU as their preferred location, or ::cudaMemLocationType
+ * will be ::cudaMemLocationTypeHost if all pages in the memory range have the CPU as their preferred location, or it will be ::cudaMemLocationTypeHostNuma
+ * if all the pages in the memory range have the same host NUMA node ID as their preferred location or it will be ::cudaMemLocationTypeInvalid
+ * if either all the pages don't have the same preferred location or some of the pages don't have a preferred location at all.
+ * Note that the actual location type of the pages in the memory range at the time of the query may be different from the preferred location type.
+ *  - ::cudaMemRangeAttributePreferredLocationId: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. If the ::cudaMemRangeAttributePreferredLocationType query for the same address range
+ * returns ::cudaMemLocationTypeDevice, it will be a valid device ordinal or if it returns ::cudaMemLocationTypeHostNuma, it will be a valid host NUMA node ID
+ * or if it returns any other location type, the id should be ignored.
+ * - ::cudaMemRangeAttributeLastPrefetchLocationType: If this attribute is specified, \p data will be
+ * interpreted as a ::cudaMemLocationType, and \p dataSize must be sizeof(cudaMemLocationType). The result returned will be the last location type
+ * to which all pages in the memory range were prefetched explicitly via ::cudaMemPrefetchAsync. The ::cudaMemLocationType returned
+ * will be ::cudaMemLocationTypeDevice if the last prefetch location was the GPU or ::cudaMemLocationTypeHost if it was the CPU or ::cudaMemLocationTypeHostNuma if
+ * the last prefetch location was a specific host NUMA node. If any page in the memory range was never explicitly prefetched or if all pages were not
+ * prefetched to the same location, ::cudaMemLocationType will be ::cudaMemLocationTypeInvalid.
+ * Note that this simply returns the last location type that the application requested to prefetch the memory range to. It gives no indication as to
+ * whether the prefetch operation to that location has completed or even begun.
+ *  - ::cudaMemRangeAttributeLastPrefetchLocationId: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. If the ::cudaMemRangeAttributeLastPrefetchLocationType query for the same address range
+ * returns ::cudaMemLocationTypeDevice, it will be a valid device ordinal or if it returns ::cudaMemLocationTypeHostNuma, it will be a valid host NUMA node ID
+ * or if it returns any other location type, the id should be ignored.
+*
+* \param data      - A pointer to a memory location where the result
+*                    of the attribute query will be written to.
+* \param dataSize  - Size in bytes of the buffer pointed to by \p data
+* \param attribute - The attribute to query
+* \param devPtr    - Start of the range to query
+* \param count     - Size of the range to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemRangeGetAttributes, ::cudaMemPrefetchAsync,
+ * ::cudaMemAdvise,
+ * ::cuMemRangeGetAttribute
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(void *data, size_t dataSize, enum cudaMemRangeAttribute attribute, const void *devPtr, size_t count);
+
+/**
+ * \brief Query attributes of a given memory range.
+ *
+ * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The
+ * memory range must refer to managed memory allocated via ::cudaMallocManaged or declared via
+ * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes
+ * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries.
+ * The results of the query will be stored in \p data.
+ *
+ * The list of supported attributes is given below. Please refer to ::cudaMemRangeGetAttribute for
+ * attribute descriptions and restrictions.
+ *
+ * - ::cudaMemRangeAttributeReadMostly
+ * - ::cudaMemRangeAttributePreferredLocation
+ * - ::cudaMemRangeAttributeAccessedBy
+ * - ::cudaMemRangeAttributeLastPrefetchLocation
+ * - ::cudaMemRangeAttributePreferredLocationType
+ * - ::cudaMemRangeAttributePreferredLocationId
+ * - ::cudaMemRangeAttributeLastPrefetchLocationType
+ * - ::cudaMemRangeAttributeLastPrefetchLocationId
+ *
+ * \param data          - A two-dimensional array containing pointers to memory
+ *                        locations where the result of each attribute query will be written to.
+ * \param dataSizes     - Array containing the sizes of each result
+ * \param attributes    - An array of attributes to query
+ *                        (numAttributes and the number of attributes in this array should match)
+ * \param numAttributes - Number of attributes to query
+ * \param devPtr        - Start of the range to query
+ * \param count         - Size of the range to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemRangeGetAttribute, ::cudaMemAdvise,
+ * ::cudaMemPrefetchAsync,
+ * ::cuMemRangeGetAttributes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes, size_t numAttributes, const void *devPtr, size_t count);
+
+/** @} */ /* END CUDART_MEMORY */
+
+/**
+ * \defgroup CUDART_MEMORY_DEPRECATED Memory Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated memory management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes deprecated memory management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Some functions have overloaded C++ API template versions documented separately in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * @{
+ */
+
+/**
+ * \brief Copies data between host and device
+ *
+ * \deprecated
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src to the
+ * CUDA array \p dst starting at \p hOffset rows and \p wOffset bytes from
+ * the upper left corner, where \p kind specifies the direction
+ * of the copy, and must be one of ::cudaMemcpyHostToHost,
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * \param dst     - Destination memory address
+ * \param wOffset - Destination starting X offset (columns in bytes)
+ * \param hOffset - Destination starting Y offset (rows)
+ * \param src     - Source memory address
+ * \param count   - Size in bytes to copy
+ * \param kind    - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyHtoA,
+ * ::cuMemcpyDtoA
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * \deprecated
+ *
+ * Copies \p count bytes from the CUDA array \p src starting at \p hOffset rows
+ * and \p wOffset bytes from the upper left corner to the memory area pointed to
+ * by \p dst, where \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * \param dst     - Destination memory address
+ * \param src     - Source memory address
+ * \param wOffset - Source starting X offset (columns in bytes)
+ * \param hOffset - Source starting Y offset (rows)
+ * \param count   - Size in bytes to copy
+ * \param kind    - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyAtoH,
+ * ::cuMemcpyAtoD
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * \deprecated
+ *
+ * Copies \p count bytes from the CUDA array \p src starting at \p hOffsetSrc
+ * rows and \p wOffsetSrc bytes from the upper left corner to the CUDA array
+ * \p dst starting at \p hOffsetDst rows and \p wOffsetDst bytes from the upper
+ * left corner, where \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * \param dst        - Destination memory address
+ * \param wOffsetDst - Destination starting X offset (columns in bytes)
+ * \param hOffsetDst - Destination starting Y offset (rows)
+ * \param src        - Source memory address
+ * \param wOffsetSrc - Source starting X offset (columns in bytes)
+ * \param hOffsetSrc - Source starting Y offset (rows)
+ * \param count      - Size in bytes to copy
+ * \param kind       - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyAtoA
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
+
+/**
+ * \brief Copies data between host and device
+ *
+ * \deprecated
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src to the
+ * CUDA array \p dst starting at \p hOffset rows and \p wOffset bytes from
+ * the upper left corner, where \p kind specifies the
+ * direction of the copy, and must be one of ::cudaMemcpyHostToHost,
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * ::cudaMemcpyToArrayAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If \p
+ * kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream
+ * is non-zero, the copy may overlap with operations in other streams.
+ *
+ * \param dst     - Destination memory address
+ * \param wOffset - Destination starting X offset (columns in bytes)
+ * \param hOffset - Destination starting Y offset (rows)
+ * \param src     - Source memory address
+ * \param count   - Size in bytes to copy
+ * \param kind    - Type of transfer
+ * \param stream  - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyHtoAAsync,
+ * ::cuMemcpy2DAsync
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data between host and device
+ *
+ * \deprecated
+ *
+ * Copies \p count bytes from the CUDA array \p src starting at \p hOffset rows
+ * and \p wOffset bytes from the upper left corner to the memory area pointed to
+ * by \p dst, where \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * ::cudaMemcpyFromArrayAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If \p
+ * kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream
+ * is non-zero, the copy may overlap with operations in other streams.
+ *
+ * \param dst     - Destination memory address
+ * \param src     - Source memory address
+ * \param wOffset - Source starting X offset (columns in bytes)
+ * \param hOffset - Source starting Y offset (rows)
+ * \param count   - Size in bytes to copy
+ * \param kind    - Type of transfer
+ * \param stream  - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyAtoHAsync,
+ * ::cuMemcpy2DAsync
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/** @} */ /* END CUDART_MEMORY_DEPRECATED */
+
+/**
+ * \defgroup CUDART_MEMORY_POOLS Stream Ordered Memory Allocator 
+ *
+ * ___MANBRIEF___ Functions for performing allocation and free operations in stream order.
+ *                Functions for controlling the behavior of the underlying allocator.
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ * 
+ *
+ * @{
+ *
+ * \section CUDART_MEMORY_POOLS_overview Overview
+ *
+ * The asynchronous allocator allows the user to allocate and free in stream order.
+ * All asynchronous accesses of the allocation must happen between
+ * the stream executions of the allocation and the free. If the memory is accessed
+ * outside of the promised stream order, a use before allocation / use after free error
+ * will cause undefined behavior.
+ *
+ * The allocator is free to reallocate the memory as long as it can guarantee
+ * that compliant memory accesses will not overlap temporally.
+ * The allocator may refer to internal stream ordering as well as inter-stream dependencies
+ * (such as CUDA events and null stream dependencies) when establishing the temporal guarantee.
+ * The allocator may also insert inter-stream dependencies to establish the temporal guarantee.
+ *
+ * \section CUDART_MEMORY_POOLS_support Supported Platforms
+ *
+ * Whether or not a device supports the integrated stream ordered memory allocator
+ * may be queried by calling ::cudaDeviceGetAttribute() with the device attribute
+ * ::cudaDevAttrMemoryPoolsSupported.
+ */
+
+/**
+ * \brief Allocates memory with stream ordered semantics
+ *
+ * Inserts an allocation operation into \p hStream.
+ * A pointer to the allocated memory is returned immediately in \p *devPtr.
+ * The allocation must not be accessed until the allocation operation completes.
+ * The allocation comes from the memory pool associated with the stream's device.
+ *
+ * \note The default memory pool of a device contains device memory from that device.
+ * \note Basic stream ordering allows future work submitted into the same stream to use the allocation.
+ *       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
+ *       operation completes before work submitted in a separate stream runs.
+ * \note During stream capture, this function results in the creation of an allocation node.  In this case,
+ *       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
+ *       are used to set the node's creation parameters.
+ *
+ * \param[out] devPtr  - Returned device pointer
+ * \param[in] size     - Number of bytes to allocate
+ * \param[in] hStream  - The stream establishing the stream ordering contract and the memory pool to allocate from
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorOutOfMemory
+ * \notefnerr
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cuMemAllocAsync,
+ * \ref ::cudaMallocAsync(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream)  "cudaMallocAsync (C++ API)", 
+ * ::cudaMallocFromPoolAsync, ::cudaFreeAsync, ::cudaDeviceSetMemPool, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolSetAccess, ::cudaMemPoolSetAttribute, ::cudaMemPoolGetAttribute
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocAsync(void **devPtr, size_t size, cudaStream_t hStream);
+
+/**
+ * \brief Frees memory with stream ordered semantics
+ *
+ * Inserts a free operation into \p hStream.
+ * The allocation must not be accessed after stream execution reaches the free.
+ * After this API returns, accessing the memory from any subsequent work launched on the GPU
+ * or querying its pointer attributes results in undefined behavior.
+ *
+ * \note During stream capture, this function results in the creation of a free node and
+ *       must therefore be passed the address of a graph allocation.
+ *
+ * \param devPtr  - memory to free
+ * \param hStream - The stream establishing the stream ordering promise
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cuMemFreeAsync, ::cudaMallocAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFreeAsync(void *devPtr, cudaStream_t hStream);
+
+/**
+ * \brief Tries to release memory back to the OS
+ *
+ * Releases memory back to the OS until the pool contains fewer than minBytesToKeep
+ * reserved bytes, or there is no more memory that the allocator can safely release.
+ * The allocator cannot release OS allocations that back outstanding asynchronous allocations.
+ * The OS allocations may happen at different granularity from the user allocations.
+ *
+ * \note: Allocations that have not been freed count as outstanding.
+ * \note: Allocations that have been asynchronously freed but whose completion has
+ *        not been observed on the host (e.g. by a synchronize) can count as outstanding.
+ *
+ * \param[in] memPool        - The memory pool to trim
+ * \param[in] minBytesToKeep - If the pool has less than minBytesToKeep reserved,
+ * the TrimTo operation is a no-op.  Otherwise the pool will be guaranteed to have
+ * at least minBytesToKeep bytes reserved after the operation.
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_callback
+ *
+ * \sa ::cuMemPoolTrimTo, ::cudaMallocAsync, ::cudaFreeAsync, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolTrimTo(cudaMemPool_t memPool, size_t minBytesToKeep);
+
+/**
+ * \brief Sets attributes of a memory pool
+ *
+ * Supported attributes are:
+ * - ::cudaMemPoolAttrReleaseThreshold: (value type = cuuint64_t)
+ *                    Amount of reserved memory in bytes to hold onto before trying
+ *                    to release memory back to the OS. When more than the release
+ *                    threshold bytes of memory are held by the memory pool, the
+ *                    allocator will try to release memory back to the OS on the
+ *                    next call to stream, event or context synchronize. (default 0)
+ * - ::cudaMemPoolReuseFollowEventDependencies: (value type = int)
+ *                    Allow ::cudaMallocAsync to use memory asynchronously freed
+ *                    in another stream as long as a stream ordering dependency
+ *                    of the allocating stream on the free action exists.
+ *                    CUDA events and null stream interactions can create the required
+ *                    stream ordered dependencies. (default enabled)
+ * - ::cudaMemPoolReuseAllowOpportunistic: (value type = int)
+ *                    Allow reuse of already completed frees when there is no dependency
+ *                    between the free and allocation. (default enabled)
+ * - ::cudaMemPoolReuseAllowInternalDependencies: (value type = int)
+ *                    Allow ::cudaMallocAsync to insert new stream dependencies
+ *                    in order to establish the stream ordering required to reuse
+ *                    a piece of memory released by ::cudaFreeAsync (default enabled).
+ * - ::cudaMemPoolAttrReservedMemHigh: (value type = cuuint64_t)
+ *                    Reset the high watermark that tracks the amount of backing memory that was
+ *                    allocated for the memory pool. It is illegal to set this attribute to a non-zero value.
+ * - ::cudaMemPoolAttrUsedMemHigh: (value type = cuuint64_t)
+ *                    Reset the high watermark that tracks the amount of used memory that was
+ *                    allocated for the memory pool. It is illegal to set this attribute to a non-zero value.
+ *
+ * \param[in] memPool - The memory pool to modify
+ * \param[in] attr    - The attribute to modify
+ * \param[in] value   - Pointer to the value to assign
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_callback
+ *
+ * \sa ::cuMemPoolSetAttribute, ::cudaMallocAsync, ::cudaFreeAsync, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolCreate
+
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolSetAttribute(cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value );
+
+/**
+ * \brief Gets attributes of a memory pool
+ *
+ * Supported attributes are:
+ * - ::cudaMemPoolAttrReleaseThreshold: (value type = cuuint64_t)
+ *                    Amount of reserved memory in bytes to hold onto before trying
+ *                    to release memory back to the OS. When more than the release
+ *                    threshold bytes of memory are held by the memory pool, the
+ *                    allocator will try to release memory back to the OS on the
+ *                    next call to stream, event or context synchronize. (default 0)
+ * - ::cudaMemPoolReuseFollowEventDependencies: (value type = int)
+ *                    Allow ::cudaMallocAsync to use memory asynchronously freed
+ *                    in another stream as long as a stream ordering dependency
+ *                    of the allocating stream on the free action exists.
+ *                    CUDA events and null stream interactions can create the required
+ *                    stream ordered dependencies. (default enabled)
+ * - ::cudaMemPoolReuseAllowOpportunistic: (value type = int)
+ *                    Allow reuse of already completed frees when there is no dependency
+ *                    between the free and allocation. (default enabled)
+ * - ::cudaMemPoolReuseAllowInternalDependencies: (value type = int)
+ *                    Allow ::cudaMallocAsync to insert new stream dependencies
+ *                    in order to establish the stream ordering required to reuse
+ *                    a piece of memory released by ::cudaFreeAsync (default enabled).
+ * - ::cudaMemPoolAttrReservedMemCurrent: (value type = cuuint64_t)
+ *                    Amount of backing memory currently allocated for the mempool.
+ * - ::cudaMemPoolAttrReservedMemHigh: (value type = cuuint64_t)
+ *                    High watermark of backing memory allocated for the mempool since
+ *                    the last time it was reset.
+ * - ::cudaMemPoolAttrUsedMemCurrent: (value type = cuuint64_t)
+ *                    Amount of memory from the pool that is currently in use by the application.
+ * - ::cudaMemPoolAttrUsedMemHigh: (value type = cuuint64_t)
+ *                    High watermark of the amount of memory from the pool that was in use by the
+ *                    application since the last time it was reset.
+ *
+ * \param[in] memPool - The memory pool to get attributes of
+ * \param[in] attr    - The attribute to get
+ * \param[out] value  - Retrieved value
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_callback
+ *
+ * \sa ::cuMemPoolGetAttribute, ::cudaMallocAsync, ::cudaFreeAsync, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolCreate
+
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolGetAttribute(cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value );
+
+/**
+ * \brief Controls visibility of pools between devices
+ *
+ * \param[in] memPool  - The pool being modified
+ * \param[in] descList - Array of access descriptors. Each descriptor instructs the access to enable for a single GPU
+ * \param[in] count    - Number of descriptors in the descList array.
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa ::cuMemPoolSetAccess, ::cudaMemPoolGetAccess, ::cudaMallocAsync, ::cudaFreeAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolSetAccess(cudaMemPool_t memPool, const struct cudaMemAccessDesc *descList, size_t count);
+
+/**
+ * \brief Returns the accessibility of a pool from a device
+ *
+ * Returns the accessibility of the pool's memory from the specified location.
+ *
+ * \param[out] flags   - the accessibility of the pool from the specified location
+ * \param[in] memPool  - the pool being queried
+ * \param[in] location - the location accessing the pool
+ *
+ * \sa ::cuMemPoolGetAccess, ::cudaMemPoolSetAccess
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolGetAccess(enum cudaMemAccessFlags *flags, cudaMemPool_t memPool, struct cudaMemLocation *location);
+
+/**
+ * \brief Creates a memory pool
+ *
+ * Creates a CUDA memory pool and returns the handle in \p memPool.  The \p poolProps determines
+ * the properties of the pool such as the backing device and IPC capabilities.
+ *
+ * To create a memory pool targeting a specific host NUMA node, applications must
+ * set ::cudaMemPoolProps::cudaMemLocation::type to ::cudaMemLocationTypeHostNuma and
+ * ::cudaMemPoolProps::cudaMemLocation::id must specify the NUMA ID of the host memory node.
+ * By default, the pool's memory will be accessible from the device it is allocated on.
+ * In the case of pools created with ::cudaMemLocationTypeHostNuma, their default accessibility
+ * will be from the host CPU.
+ * Applications can control the maximum size of the pool by specifying a non-zero value for ::cudaMemPoolProps::maxSize.
+ * If set to 0, the maximum size of the pool will default to a system dependent value.
+ *
+ * Applications can set ::cudaMemPoolProps::handleTypes to ::cudaMemHandleTypeFabric
+ * in order to create ::cudaMemPool_t suitable for sharing within an IMEX domain.
+ * An IMEX domain is either an OS instance or a group of securely connected OS instances
+ * using the NVIDIA IMEX daemon. An IMEX channel is a global resource within the IMEX domain
+ * that represents a logical entity that aims to provide fine grained accessibility control
+ * for the participating processes. When exporter and importer CUDA processes have been
+ * granted access to the same IMEX channel, they can securely share memory.
+ * If the allocating process does not have access setup for an IMEX channel, attempting to export
+ * a ::cudaMemPool_t with ::cudaMemHandleTypeFabric will result in ::cudaErrorNotPermitted.
+ * The nvidia-modprobe CLI provides more information regarding setting up of IMEX channels.
+ *
+ * \note Specifying cudaMemHandleTypeNone creates a memory pool that will not support IPC.
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ *
+ * \sa ::cuMemPoolCreate, ::cudaDeviceSetMemPool, ::cudaMallocFromPoolAsync, ::cudaMemPoolExportToShareableHandle, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool
+
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolCreate(cudaMemPool_t *memPool, const struct cudaMemPoolProps *poolProps);
+
+/**
+ * \brief Destroys the specified memory pool 
+ *
+ * If any pointers obtained from this pool haven't been freed or
+ * the pool has free operations that haven't completed
+ * when ::cudaMemPoolDestroy is invoked, the function will return immediately and the
+ * resources associated with the pool will be released automatically
+ * once there are no more outstanding allocations.
+ *
+ * Destroying the current mempool of a device sets the default mempool of
+ * that device as the current mempool for that device.
+ *
+ * \note A device's default memory pool cannot be destroyed.
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa ::cuMemPoolDestroy, ::cudaFreeAsync, ::cudaDeviceSetMemPool, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolDestroy(cudaMemPool_t memPool);
+
+/**
+ * \brief Allocates memory from a specified pool with stream ordered semantics.
+ *
+ * Inserts an allocation operation into \p hStream.
+ * A pointer to the allocated memory is returned immediately in \p *ptr.
+ * The allocation must not be accessed until the allocation operation completes.
+ * The allocation comes from the specified memory pool.
+ *
+ * \note
+ *    -  The specified memory pool may be from a device different than that of the specified \p hStream.
+ *
+ *    -  Basic stream ordering allows future work submitted into the same stream to use the allocation.
+ *       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
+ *       operation completes before work submitted in a separate stream runs.
+ *
+ * \note During stream capture, this function results in the creation of an allocation node.  In this case,
+ *       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
+ *       are used to set the node's creation parameters.
+ *
+ * \param[out] ptr     - Returned device pointer
+ * \param[in] size     - Number of bytes to allocate
+ * \param[in] memPool  - The pool to allocate from
+ * \param[in] stream   - The stream establishing the stream ordering semantic
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorOutOfMemory
+ *
+ * \sa ::cuMemAllocFromPoolAsync,
+ * \ref ::cudaMallocAsync(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream)  "cudaMallocAsync (C++ API)", 
+ * ::cudaMallocAsync, ::cudaFreeAsync, ::cudaDeviceGetDefaultMemPool, ::cudaMemPoolCreate, ::cudaMemPoolSetAccess, ::cudaMemPoolSetAttribute
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocFromPoolAsync(void **ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream);
+
+/**
+ * \brief Exports a memory pool to the requested handle type.
+ *
+ * Given an IPC capable mempool, create an OS handle to share the pool with another process.
+ * A recipient process can convert the shareable handle into a mempool with ::cudaMemPoolImportFromShareableHandle.
+ * Individual pointers can then be shared with the ::cudaMemPoolExportPointer and ::cudaMemPoolImportPointer APIs.
+ * The implementation of what the shareable handle is and how it can be transferred is defined by the requested
+ * handle type.
+ *
+ * \note: To create an IPC capable mempool, create a mempool with a cudaMemAllocationHandleType other than cudaMemHandleTypeNone.
+ *
+ * \param[out] shareableHandle - pointer to the location in which to store the requested handle
+ * \param[in] memPool          - pool to export
+ * \param[in] handleType       - the type of handle to create
+ * \param[in] flags            - must be 0
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOutOfMemory
+ *
+ * \sa ::cuMemPoolExportToShareableHandle, ::cudaMemPoolImportFromShareableHandle, ::cudaMemPoolExportPointer, ::cudaMemPoolImportPointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolExportToShareableHandle(
+    void                            *shareableHandle,
+    cudaMemPool_t                    memPool,
+    enum cudaMemAllocationHandleType handleType,
+    unsigned int                     flags);
+
+/**
+ * \brief Imports a memory pool from a shared handle.
+ *
+ * Specific allocations can be imported from the imported pool with ::cudaMemPoolImportPointer.
+ *
+ * \note Imported memory pools do not support creating new allocations.
+ *       As such imported memory pools may not be used in ::cudaDeviceSetMemPool
+ *       or ::cudaMallocFromPoolAsync calls.
+ *
+ * \param[out] memPool        - Returned memory pool
+ * \param[in] shareableHandle - OS handle of the pool to open
+ * \param[in] handleType      - The type of handle being imported
+ * \param[in] flags           - must be 0
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOutOfMemory
+ *
+ * \sa ::cuMemPoolImportFromShareableHandle, ::cudaMemPoolExportToShareableHandle, ::cudaMemPoolExportPointer, ::cudaMemPoolImportPointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolImportFromShareableHandle(
+    cudaMemPool_t                   *memPool,
+    void                            *shareableHandle,
+    enum cudaMemAllocationHandleType handleType,
+    unsigned int                     flags);
+
+/**
+ * \brief Export data to share a memory pool allocation between processes.
+ *
+ * Constructs \p exportData for sharing a specific allocation from an already shared memory pool.
+ * The recipient process can import the allocation with the ::cudaMemPoolImportPointer api.
+ * The data is not a handle and may be shared through any IPC mechanism.
+ *
+ * \param[out] exportData - Returned export data
+ * \param[in] ptr         - pointer to memory being exported
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOutOfMemory
+ *
+ * \sa ::cuMemPoolExportPointer, ::cudaMemPoolExportToShareableHandle, ::cudaMemPoolImportFromShareableHandle, ::cudaMemPoolImportPointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolExportPointer(struct cudaMemPoolPtrExportData *exportData, void *ptr);
+
+/**
+ * \brief Import a memory pool allocation from another process.
+ *
+ * Returns in \p *ptr a pointer to the imported memory.
+ * The imported memory must not be accessed before the allocation operation completes
+ * in the exporting process. The imported memory must be freed from all importing processes before
+ * being freed in the exporting process. The pointer may be freed with cudaFree
+ * or cudaFreeAsync.  If ::cudaFreeAsync is used, the free must be completed
+ * on the importing process before the free operation on the exporting process.
+ *
+ * \note The ::cudaFreeAsync api may be used in the exporting process before
+ *       the ::cudaFreeAsync operation completes in its stream as long as the
+ *       ::cudaFreeAsync in the exporting process specifies a stream with
+ *       a stream dependency on the importing process's ::cudaFreeAsync.
+ *
+ * \param[out] ptr       - pointer to imported memory
+ * \param[in] memPool    - pool from which to import
+ * \param[in] exportData - data specifying the memory to import
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorOutOfMemory
+ *
+ * \sa ::cuMemPoolImportPointer, ::cudaMemPoolExportToShareableHandle, ::cudaMemPoolImportFromShareableHandle, ::cudaMemPoolExportPointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolImportPointer(void **ptr, cudaMemPool_t memPool, struct cudaMemPoolPtrExportData *exportData);
+
+/** @} */ /* END CUDART_MEMORY_POOLS */
+
+/**
+ * \defgroup CUDART_UNIFIED Unified Addressing
+ *
+ * ___MANBRIEF___ unified addressing functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the unified addressing functions of the CUDA 
+ * runtime application programming interface.
+ *
+ * @{
+ *
+ * \section CUDART_UNIFIED_overview Overview
+ *
+ * CUDA devices can share a unified address space with the host.  
+ * For these devices there is no distinction between a device
+ * pointer and a host pointer -- the same pointer value may be 
+ * used to access memory from the host program and from a kernel 
+ * running on the device (with exceptions enumerated below).
+ *
+ * \section CUDART_UNIFIED_support Supported Platforms
+ * 
+ * Whether or not a device supports unified addressing may be 
+ * queried by calling ::cudaGetDeviceProperties() with the device 
+ * property ::cudaDeviceProp::unifiedAddressing.
+ *
+ * Unified addressing is automatically enabled in 64-bit processes.
+ *
+ * \section CUDART_UNIFIED_lookup Looking Up Information from Pointer Values
+ *
+ * It is possible to look up information about the memory which backs a 
+ * pointer value.  For instance, one may want to know if a pointer points
+ * to host or device memory.  As another example, in the case of device 
+ * memory, one may want to know on which CUDA device the memory 
+ * resides.  These properties may be queried using the function 
+ * ::cudaPointerGetAttributes()
+ *
+ * Since pointers are unique, it is not necessary to specify information
+ * about the pointers specified to ::cudaMemcpy() and other copy functions.  
+ * The copy direction ::cudaMemcpyDefault may be used to specify that the 
+ * CUDA runtime should infer the location of the pointer from its value.
+ *
+ * \section CUDART_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory
+ *
+ * All host memory allocated through all devices using ::cudaMallocHost() and
+ * ::cudaHostAlloc() is always directly accessible from all devices that 
+ * support unified addressing.  This is the case regardless of whether or 
+ * not the flags ::cudaHostAllocPortable and ::cudaHostAllocMapped are 
+ * specified.
+ *
+ * The pointer value through which allocated host memory may be accessed 
+ * in kernels on all devices that support unified addressing is the same 
+ * as the pointer value through which that memory is accessed on the host.
+ * It is not necessary to call ::cudaHostGetDevicePointer() to get the device 
+ * pointer for these allocations.  
+ *
+ * Note that this is not the case for memory allocated using the flag
+ * ::cudaHostAllocWriteCombined, as discussed below.
+ *
+ * \section CUDART_UNIFIED_autopeerregister Direct Access of Peer Memory
+ *
+ * Upon enabling direct access from a device that supports unified addressing 
+ * to another peer device that supports unified addressing using 
+ * ::cudaDeviceEnablePeerAccess() all memory allocated in the peer device using 
+ * ::cudaMalloc() and ::cudaMallocPitch() will immediately be accessible 
+ * by the current device.  The device pointer value through 
+ * which any peer's memory may be accessed in the current device 
+ * is the same pointer value through which that memory may be 
+ * accessed from the peer device. 
+ *
+ * \section CUDART_UNIFIED_exceptions Exceptions, Disjoint Addressing
+ * 
+ * Not all memory may be accessed on devices through the same pointer
+ * value through which they are accessed on the host.  These exceptions
+ * are host memory registered using ::cudaHostRegister() and host memory
+ * allocated using the flag ::cudaHostAllocWriteCombined.  For these 
+ * exceptions, there exists a distinct host and device address for the
+ * memory.  The device address is guaranteed to not overlap any valid host
+ * pointer range and is guaranteed to have the same value across all devices
+ * that support unified addressing.  
+ * 
+ * This device address may be queried using ::cudaHostGetDevicePointer() 
+ * when a device using unified addressing is current.  Either the host 
+ * or the unified device pointer value may be used to refer to this memory 
+ * in ::cudaMemcpy() and similar functions using the ::cudaMemcpyDefault 
+ * memory direction.
+ *
+ */
+
+/**
+ * \brief Returns attributes about a specified pointer
+ *
+ * Returns in \p *attributes the attributes of the pointer \p ptr.
+ * If the pointer was not allocated in, mapped by, or registered with a context
+ * supporting unified addressing, ::cudaErrorInvalidValue is returned.
+ *
+ * \note From CUDA 11.0 onward, passing a host pointer will return ::cudaMemoryTypeUnregistered
+ * in ::cudaPointerAttributes::type and the call will return ::cudaSuccess.
+ *
+ * The ::cudaPointerAttributes structure is defined as:
+ * \code
+    struct cudaPointerAttributes {
+        enum cudaMemoryType type;
+        int device;
+        void *devicePointer;
+        void *hostPointer;
+    };
+    \endcode
+ * In this structure, the individual fields mean
+ *
+ * - \ref ::cudaPointerAttributes::type "type" identifies the type of memory. It can be
+ *    ::cudaMemoryTypeUnregistered for unregistered host memory,
+ *    ::cudaMemoryTypeHost for registered host memory, ::cudaMemoryTypeDevice for device
+ *    memory or ::cudaMemoryTypeManaged for managed memory.
+ *
+ * - \ref ::cudaPointerAttributes::device "device" is the device against which
+ *   \p ptr was allocated.  If \p ptr has memory type ::cudaMemoryTypeDevice
+ *   then this identifies the device on which the memory referred to by \p ptr
+ *   physically resides.  If \p ptr has memory type ::cudaMemoryTypeHost then this
+ *   identifies the device which was current when the allocation was made
+ *   (and if that device is deinitialized then this allocation will vanish
+ *   with that device's state).
+ *
+ * - \ref ::cudaPointerAttributes::devicePointer "devicePointer" is
+ *   the device pointer alias through which the memory referred to by \p ptr
+ *   may be accessed on the current device.
+ *   If the memory referred to by \p ptr cannot be accessed directly by the 
+ *   current device then this is NULL.  
+ *
+ * - \ref ::cudaPointerAttributes::hostPointer "hostPointer" is
+ *   the host pointer alias through which the memory referred to by \p ptr
+ *   may be accessed on the host.
+ *   If the memory referred to by \p ptr cannot be accessed directly by the
+ *   host then this is NULL.
+ *
+ * \param attributes - Attributes for the specified pointer
+ * \param ptr        - Pointer to get attributes for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice,
+ * ::cudaChooseDevice,
+ * ::cudaInitDevice,
+ * ::cuPointerGetAttributes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(struct cudaPointerAttributes *attributes, const void *ptr);
+
+/** @} */ /* END CUDART_UNIFIED */
+
+/**
+ * \defgroup CUDART_PEER Peer Device Memory Access
+ *
+ * ___MANBRIEF___ peer device memory access functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the peer device memory access functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Queries if a device may directly access a peer device's memory.
+ *
+ * Returns in \p *canAccessPeer a value of 1 if device \p device is capable of
+ * directly accessing memory from \p peerDevice and 0 otherwise.  If direct
+ * access of \p peerDevice from \p device is possible, then access may be
+ * enabled by calling ::cudaDeviceEnablePeerAccess().
+ *
+ * \param canAccessPeer - Returned access capability
+ * \param device        - Device from which allocations on \p peerDevice are to
+ *                        be directly accessed.
+ * \param peerDevice    - Device on which the allocations to be directly accessed 
+ *                        by \p device reside.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceEnablePeerAccess,
+ * ::cudaDeviceDisablePeerAccess,
+ * ::cuDeviceCanAccessPeer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice);
+
+/**
+ * \brief Enables direct access to memory allocations on a peer device.
+ *
+ * On success, all allocations from \p peerDevice will immediately be accessible by
+ * the current device.  They will remain accessible until access is explicitly
+ * disabled using ::cudaDeviceDisablePeerAccess() or either device is reset using
+ * ::cudaDeviceReset().
+ *
+ * Note that access granted by this call is unidirectional and that in order to access
+ * memory on the current device from \p peerDevice, a separate symmetric call 
+ * to ::cudaDeviceEnablePeerAccess() is required.
+ *
+ * Note that there are both device-wide and system-wide limitations per system
+ * configuration, as noted in the CUDA Programming Guide under the section
+ * "Peer-to-Peer Memory Access".
+ *
+ * Returns ::cudaErrorInvalidDevice if ::cudaDeviceCanAccessPeer() indicates
+ * that the current device cannot directly access memory from \p peerDevice.
+ *
+ * Returns ::cudaErrorPeerAccessAlreadyEnabled if direct access of
+ * \p peerDevice from the current device has already been enabled.
+ *
+ * Returns ::cudaErrorInvalidValue if \p flags is not 0.
+ *
+ * \param peerDevice  - Peer device to enable direct access to from the current device
+ * \param flags       - Reserved for future use and must be set to 0
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorPeerAccessAlreadyEnabled,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceCanAccessPeer,
+ * ::cudaDeviceDisablePeerAccess,
+ * ::cuCtxEnablePeerAccess
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags);
+
+/**
+ * \brief Disables direct access to memory allocations on a peer device.
+ *
+ * Returns ::cudaErrorPeerAccessNotEnabled if direct access to memory on
+ * \p peerDevice has not yet been enabled from the current device.
+ *
+ * \param peerDevice - Peer device to disable direct access to
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorPeerAccessNotEnabled,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceCanAccessPeer,
+ * ::cudaDeviceEnablePeerAccess,
+ * ::cuCtxDisablePeerAccess
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceDisablePeerAccess(int peerDevice);
+
+/** @} */ /* END CUDART_PEER */
+
+/** \defgroup CUDART_OPENGL OpenGL Interoperability */
+
+/** \defgroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED] */
+
+/** \defgroup CUDART_D3D9 Direct3D 9 Interoperability */
+
+/** \defgroup CUDART_D3D9_DEPRECATED Direct3D 9 Interoperability [DEPRECATED] */
+
+/** \defgroup CUDART_D3D10 Direct3D 10 Interoperability */
+
+/** \defgroup CUDART_D3D10_DEPRECATED Direct3D 10 Interoperability [DEPRECATED] */
+
+/** \defgroup CUDART_D3D11 Direct3D 11 Interoperability */
+
+/** \defgroup CUDART_D3D11_DEPRECATED Direct3D 11 Interoperability [DEPRECATED] */
+
+/** \defgroup CUDART_VDPAU VDPAU Interoperability */
+
+/** \defgroup CUDART_EGL EGL Interoperability */
+
+/**
+ * \defgroup CUDART_INTEROP Graphics Interoperability
+ *
+ * ___MANBRIEF___ graphics interoperability functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the graphics interoperability functions of the CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Unregisters a graphics resource for access by CUDA
+ *
+ * Unregisters the graphics resource \p resource so it is not accessible by
+ * CUDA unless registered again.
+ *
+ * If \p resource is invalid then ::cudaErrorInvalidResourceHandle is
+ * returned.
+ *
+ * \param resource - Resource to unregister
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaGraphicsD3D9RegisterResource,
+ * ::cudaGraphicsD3D10RegisterResource,
+ * ::cudaGraphicsD3D11RegisterResource,
+ * ::cudaGraphicsGLRegisterBuffer,
+ * ::cudaGraphicsGLRegisterImage,
+ * ::cuGraphicsUnregisterResource
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource);
+
+/**
+ * \brief Set usage flags for mapping a graphics resource
+ *
+ * Set \p flags for mapping the graphics resource \p resource.
+ *
+ * Changes to \p flags will take effect the next time \p resource is mapped.
+ * The \p flags argument may be any of the following:
+ * - ::cudaGraphicsMapFlagsNone: Specifies no hints about how \p resource will
+ *     be used. It is therefore assumed that CUDA may read from or write to \p resource.
+ * - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA will not write to \p resource.
+ * - ::cudaGraphicsMapFlagsWriteDiscard: Specifies CUDA will not read from \p resource and will
+ *   write over the entire contents of \p resource, so none of the data
+ *   previously stored in \p resource will be preserved.
+ *
+ * If \p resource is presently mapped for access by CUDA then ::cudaErrorUnknown is returned.
+ * If \p flags is not one of the above values then ::cudaErrorInvalidValue is returned.
+ *
+ * \param resource - Registered resource to set flags for
+ * \param flags    - Parameters for resource mapping
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsMapResources,
+ * ::cuGraphicsResourceSetMapFlags
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(cudaGraphicsResource_t resource, unsigned int flags);
+
+/**
+ * \brief Map graphics resources for access by CUDA
+ *
+ * Maps the \p count graphics resources in \p resources for access by CUDA.
+ *
+ * The resources in \p resources may be accessed by CUDA until they
+ * are unmapped. The graphics API from which \p resources were registered
+ * should not access any resources while they are mapped by CUDA. If an
+ * application does so, the results are undefined.
+ *
+ * This function provides the synchronization guarantee that any graphics calls
+ * issued before ::cudaGraphicsMapResources() will complete before any subsequent CUDA
+ * work issued in \p stream begins.
+ *
+ * If \p resources contains any duplicate entries then ::cudaErrorInvalidResourceHandle
+ * is returned. If any of \p resources are presently mapped for access by
+ * CUDA then ::cudaErrorUnknown is returned.
+ *
+ * \param count     - Number of resources to map
+ * \param resources - Resources to map for CUDA
+ * \param stream    - Stream for synchronization
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cudaGraphicsUnmapResources,
+ * ::cuGraphicsMapResources
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Unmap graphics resources.
+ *
+ * Unmaps the \p count graphics resources in \p resources.
+ *
+ * Once unmapped, the resources in \p resources may not be accessed by CUDA
+ * until they are mapped again.
+ *
+ * This function provides the synchronization guarantee that any CUDA work issued
+ * in \p stream before ::cudaGraphicsUnmapResources() will complete before any
+ * subsequently issued graphics work begins.
+ *
+ * If \p resources contains any duplicate entries then ::cudaErrorInvalidResourceHandle
+ * is returned. If any of \p resources are not presently mapped for access by
+ * CUDA then ::cudaErrorUnknown is returned.
+ *
+ * \param count     - Number of resources to unmap
+ * \param resources - Resources to unmap
+ * \param stream    - Stream for synchronization
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsMapResources,
+ * ::cuGraphicsUnmapResources
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Get a device pointer through which to access a mapped graphics resource.
+ *
+ * Returns in \p *devPtr a pointer through which the mapped graphics resource
+ * \p resource may be accessed.
+ * Returns in \p *size the size of the memory in bytes which may be accessed from that pointer.
+ * The value set in \p devPtr may change every time that \p resource is mapped.
+ *
+ * If \p resource is not a buffer then it cannot be accessed via a pointer and
+ * ::cudaErrorUnknown is returned.
+ * If \p resource is not mapped then ::cudaErrorUnknown is returned.
+ *
+ * \param devPtr     - Returned pointer through which \p resource may be accessed
+ * \param size       - Returned size of the buffer accessible starting at \p *devPtr
+ * \param resource   - Mapped resource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsMapResources,
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsResourceGetMappedPointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(void **devPtr, size_t *size, cudaGraphicsResource_t resource);
+
+/**
+ * \brief Get an array through which to access a subresource of a mapped graphics resource.
+ *
+ * Returns in \p *array an array through which the subresource of the mapped
+ * graphics resource \p resource which corresponds to array index \p arrayIndex
+ * and mipmap level \p mipLevel may be accessed.  The value set in \p array may
+ * change every time that \p resource is mapped.
+ *
+ * If \p resource is not a texture then it cannot be accessed via an array and
+ * ::cudaErrorUnknown is returned.
+ * If \p arrayIndex is not a valid array index for \p resource then
+ * ::cudaErrorInvalidValue is returned.
+ * If \p mipLevel is not a valid mipmap level for \p resource then
+ * ::cudaErrorInvalidValue is returned.
+ * If \p resource is not mapped then ::cudaErrorUnknown is returned.
+ *
+ * \param array       - Returned array through which a subresource of \p resource may be accessed
+ * \param resource    - Mapped resource to access
+ * \param arrayIndex  - Array index for array textures or cubemap face
+ *                      index as defined by ::cudaGraphicsCubeFace for
+ *                      cubemap textures for the subresource to access
+ * \param mipLevel    - Mipmap level for the subresource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsSubResourceGetMappedArray
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(cudaArray_t *array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel);
+
+/**
+ * \brief Get a mipmapped array through which to access a mapped graphics resource.
+ *
+ * Returns in \p *mipmappedArray a mipmapped array through which the mapped
+ * graphics resource \p resource may be accessed. The value set in \p mipmappedArray may
+ * change every time that \p resource is mapped.
+ *
+ * If \p resource is not a texture then it cannot be accessed via an array and
+ * ::cudaErrorUnknown is returned.
+ * If \p resource is not mapped then ::cudaErrorUnknown is returned.
+ *
+ * \param mipmappedArray - Returned mipmapped array through which \p resource may be accessed
+ * \param resource       - Mapped resource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsResourceGetMappedMipmappedArray
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedMipmappedArray(cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource);
+
+/** @} */ /* END CUDART_INTEROP */
+
+/**
+ * \defgroup CUDART_TEXTURE_OBJECT Texture Object Management
+ *
+ * ___MANBRIEF___ texture object management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the low level texture object management functions
+ * of the CUDA runtime application programming interface. The texture
+ * object API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Get the channel descriptor of an array
+ *
+ * Returns in \p *desc the channel descriptor of the CUDA array \p array.
+ *
+ * \param desc  - Channel format
+ * \param array - Memory array on device
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaCreateTextureObject, ::cudaCreateSurfaceObject
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(struct cudaChannelFormatDesc *desc, cudaArray_const_t array);
+
+/**
+ * \brief Returns a channel descriptor using the specified format
+ *
+ * Returns a channel descriptor with format \p f and number of bits of each
+ * component \p x, \p y, \p z, and \p w.  The ::cudaChannelFormatDesc is
+ * defined as:
+ * \code
+  struct cudaChannelFormatDesc {
+    int x, y, z, w;
+    enum cudaChannelFormatKind f;
+  };
+ * \endcode
+ *
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
+ *
+ * \param x - X component
+ * \param y - Y component
+ * \param z - Z component
+ * \param w - W component
+ * \param f - Channel format
+ *
+ * \return
+ * Channel descriptor with format \p f
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaCreateTextureObject, ::cudaCreateSurfaceObject
+ */
+extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(int x, int y, int z, int w, enum cudaChannelFormatKind f);
+
+/**
+ * \brief Creates a texture object
+ *
+ * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes
+ * the data to texture from. \p pTexDesc describes how the data should be sampled.
+ * \p pResViewDesc is an optional argument that specifies an alternate format for
+ * the data described by \p pResDesc, and also describes the subresource region
+ * to restrict access to when texturing. \p pResViewDesc can only be specified if
+ * the type of resource is a CUDA array or a CUDA mipmapped array.
+ *
+ * Texture objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a texture object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * The ::cudaResourceDesc structure is defined as:
+ * \code
+        struct cudaResourceDesc {
+            enum cudaResourceType resType;
+            
+            union {
+                struct {
+                    cudaArray_t array;
+                } array;
+                struct {
+                    cudaMipmappedArray_t mipmap;
+                } mipmap;
+                struct {
+                    void *devPtr;
+                    struct cudaChannelFormatDesc desc;
+                    size_t sizeInBytes;
+                } linear;
+                struct {
+                    void *devPtr;
+                    struct cudaChannelFormatDesc desc;
+                    size_t width;
+                    size_t height;
+                    size_t pitchInBytes;
+                } pitch2D;
+            } res;
+        };
+ * \endcode
+ * where:
+ * - ::cudaResourceDesc::resType specifies the type of resource to texture from.
+ * ::cudaResourceType is defined as:
+ * \code
+        enum cudaResourceType {
+            cudaResourceTypeArray          = 0x00,
+            cudaResourceTypeMipmappedArray = 0x01,
+            cudaResourceTypeLinear         = 0x02,
+            cudaResourceTypePitch2D        = 0x03
+        };
+ * \endcode
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeArray, ::cudaResourceDesc::res::array::array
+ * must be set to a valid CUDA array handle.
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeMipmappedArray, ::cudaResourceDesc::res::mipmap::mipmap
+ * must be set to a valid CUDA mipmapped array handle and ::cudaTextureDesc::normalizedCoords must be set to true.
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeLinear, ::cudaResourceDesc::res::linear::devPtr
+ * must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment.
+ * ::cudaResourceDesc::res::linear::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::linear::sizeInBytes
+ * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed 
+ * ::cudaDeviceProp::maxTexture1DLinear. The number of elements is computed as (sizeInBytes / sizeof(desc)).
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypePitch2D, ::cudaResourceDesc::res::pitch2D::devPtr
+ * must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment.
+ * ::cudaResourceDesc::res::pitch2D::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::pitch2D::width
+ * and ::cudaResourceDesc::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed
+ * ::cudaDeviceProp::maxTexture2DLinear[0] and ::cudaDeviceProp::maxTexture2DLinear[1] respectively.
+ * ::cudaResourceDesc::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to 
+ * ::cudaDeviceProp::texturePitchAlignment. Pitch cannot exceed ::cudaDeviceProp::maxTexture2DLinear[2].
+ *
+ *
+ * The ::cudaTextureDesc struct is defined as
+ * \code
+        struct cudaTextureDesc {
+            enum cudaTextureAddressMode addressMode[3];
+            enum cudaTextureFilterMode  filterMode;
+            enum cudaTextureReadMode    readMode;
+            int                         sRGB;
+            float                       borderColor[4];
+            int                         normalizedCoords;
+            unsigned int                maxAnisotropy;
+            enum cudaTextureFilterMode  mipmapFilterMode;
+            float                       mipmapLevelBias;
+            float                       minMipmapLevelClamp;
+            float                       maxMipmapLevelClamp;
+            int                         disableTrilinearOptimization;
+            int                         seamlessCubemap;
+        };
+ * \endcode
+ * where
+ * - ::cudaTextureDesc::addressMode specifies the addressing mode for each dimension of the texture data. ::cudaTextureAddressMode is defined as:
+ *   \code
+        enum cudaTextureAddressMode {
+            cudaAddressModeWrap   = 0,
+            cudaAddressModeClamp  = 1,
+            cudaAddressModeMirror = 2,
+            cudaAddressModeBorder = 3
+        };
+ *   \endcode
+ *   This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear. Also, if ::cudaTextureDesc::normalizedCoords
+ *   is set to zero, ::cudaAddressModeWrap and ::cudaAddressModeMirror won't be supported and will be switched to ::cudaAddressModeClamp.
+ *
+ * - ::cudaTextureDesc::filterMode specifies the filtering mode to be used when fetching from the texture. ::cudaTextureFilterMode is defined as:
+ *   \code
+        enum cudaTextureFilterMode {
+            cudaFilterModePoint  = 0,
+            cudaFilterModeLinear = 1
+        };
+ *   \endcode
+ *   This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear.
+ *
+ * - ::cudaTextureDesc::readMode specifies whether integer data should be converted to floating point or not. ::cudaTextureReadMode is defined as:
+ *   \code
+        enum cudaTextureReadMode {
+            cudaReadModeElementType     = 0,
+            cudaReadModeNormalizedFloat = 1
+        };
+ *   \endcode
+ *   Note that this applies only to 8-bit and 16-bit integer formats. 32-bit integer formats are not promoted, regardless of
+ *   whether or not ::cudaTextureDesc::readMode is set to ::cudaReadModeNormalizedFloat.
+ *
+ * - ::cudaTextureDesc::sRGB specifies whether sRGB to linear conversion should be performed during texture fetch.
+ *
+ * - ::cudaTextureDesc::borderColor specifies the float values of the border color, where:
+ *   ::cudaTextureDesc::borderColor[0] contains value of 'R', 
+ *   ::cudaTextureDesc::borderColor[1] contains value of 'G',
+ *   ::cudaTextureDesc::borderColor[2] contains value of 'B', 
+ *   ::cudaTextureDesc::borderColor[3] contains value of 'A'
+ *   Note that applications using integer border color values will need to <reinterpret_cast> these values to float.
+ *   The values are set only when the addressing mode specified by ::cudaTextureDesc::addressMode is cudaAddressModeBorder.
+ *
+ * - ::cudaTextureDesc::normalizedCoords specifies whether the texture coordinates will be normalized or not.
+ *
+ * - ::cudaTextureDesc::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be
+ *   clamped to the range [1,16].
+ *
+ * - ::cudaTextureDesc::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels.
+ *
+ * - ::cudaTextureDesc::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level.
+ *
+ * - ::cudaTextureDesc::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to.
+ *
+ * - ::cudaTextureDesc::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to.
+ *
+ * - ::cudaTextureDesc::disableTrilinearOptimization specifies whether the trilinear filtering optimizations will be disabled.
+ *
+ * - ::cudaTextureDesc::seamlessCubemap specifies whether seamless cube map filtering is enabled. This flag can only be specified if the 
+ *   underlying resource is a CUDA array or a CUDA mipmapped array that was created with the flag ::cudaArrayCubemap.
+ *   When seamless cube map filtering is enabled, texture address modes specified by ::cudaTextureDesc::addressMode are ignored.
+ *   Instead, if the ::cudaTextureDesc::filterMode is set to ::cudaFilterModePoint the address mode ::cudaAddressModeClamp will be applied for all dimensions.
+ *   If the ::cudaTextureDesc::filterMode is set to ::cudaFilterModeLinear seamless cube map filtering will be performed when sampling along the cube face borders.
+ *
+ * The ::cudaResourceViewDesc struct is defined as
+ * \code
+        struct cudaResourceViewDesc {
+            enum cudaResourceViewFormat format;
+            size_t                      width;
+            size_t                      height;
+            size_t                      depth;
+            unsigned int                firstMipmapLevel;
+            unsigned int                lastMipmapLevel;
+            unsigned int                firstLayer;
+            unsigned int                lastLayer;
+        };
+ * \endcode
+ * where:
+ * - ::cudaResourceViewDesc::format specifies how the data contained in the CUDA array or CUDA mipmapped array should
+ *   be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block
+ *   compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a 32-bit unsigned integer format
+ *   with 2 or 4 channels, depending on the block compressed format. For ex., BC1 and BC4 require the underlying CUDA array to have
+ *   a 32-bit unsigned int with 2 channels. The other BC formats require the underlying resource to have the same 32-bit unsigned int
+ *   format but with 4 channels.
+ *
+ * - ::cudaResourceViewDesc::width specifies the new width of the texture data. If the resource view format is a block
+ *   compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats,
+ *   this value has to be equal to that of the original resource.
+ *
+ * - ::cudaResourceViewDesc::height specifies the new height of the texture data. If the resource view format is a block
+ *   compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats,
+ *   this value has to be equal to that of the original resource.
+ *
+ * - ::cudaResourceViewDesc::depth specifies the new depth of the texture data. This value has to be equal to that of the
+ *   original resource.
+ *
+ * - ::cudaResourceViewDesc::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero.
+ *   For non-mipmapped resources, this value has to be zero. ::cudaTextureDesc::minMipmapLevelClamp and ::cudaTextureDesc::maxMipmapLevelClamp
+ *   will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified,
+ *   then the actual minimum mipmap level clamp will be 3.2.
+ *
+ * - ::cudaResourceViewDesc::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value
+ *   has to be zero.
+ *
+ * - ::cudaResourceViewDesc::firstLayer specifies the first layer index for layered textures. This will be the new layer zero.
+ *   For non-layered resources, this value has to be zero.
+ *
+ * - ::cudaResourceViewDesc::lastLayer specifies the last layer index for layered textures. For non-layered resources, 
+ *   this value has to be zero.
+ *
+ *
+ * \param pTexObject   - Texture object to create
+ * \param pResDesc     - Resource descriptor
+ * \param pTexDesc     - Texture descriptor
+ * \param pResViewDesc - Resource view descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDestroyTextureObject,
+ * ::cuTexObjectCreate
+ */
+
+extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc);
+
+/**
+ * \brief Destroys a texture object
+ *
+ * Destroys the texture object specified by \p texObject.
+ *
+ * \param texObject - Texture object to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDestroyTextureObject(cudaTextureObject_t texObject);
+
+/**
+ * \brief Returns a texture object's resource descriptor
+ *
+ * Returns the resource descriptor for the texture object specified by \p texObject.
+ *
+ * \param pResDesc  - Resource descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectGetResourceDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject);
+
+/**
+ * \brief Returns a texture object's texture descriptor
+ *
+ * Returns the texture descriptor for the texture object specified by \p texObject.
+ *
+ * \param pTexDesc  - Texture descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectGetTextureDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject);
+
+/**
+ * \brief Returns a texture object's resource view descriptor
+ *
+ * Returns the resource view descriptor for the texture object specified by \p texObject.
+ * If no resource view was specified, ::cudaErrorInvalidValue is returned.
+ *
+ * \param pResViewDesc - Resource view descriptor
+ * \param texObject    - Texture object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectGetResourceViewDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject);
+
+/** @} */ /* END CUDART_TEXTURE_OBJECT */
+
+/**
+ * \defgroup CUDART_SURFACE_OBJECT Surface Object Management
+ *
+ * ___MANBRIEF___ surface object management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the low level surface object management functions
+ * of the CUDA runtime application programming interface. The surface object 
+ * API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a surface object
+ *
+ * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes
+ * the data to perform surface load/stores on. ::cudaResourceDesc::resType must be 
+ * ::cudaResourceTypeArray and  ::cudaResourceDesc::res::array::array
+ * must be set to a valid CUDA array handle.
+ *
+ * Surface objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a surface object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * \param pSurfObject - Surface object to create
+ * \param pResDesc    - Resource descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidChannelDescriptor,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDestroySurfaceObject,
+ * ::cuSurfObjectCreate
+ */
+
+extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc);
+
+/**
+ * \brief Destroys a surface object
+ *
+ * Destroys the surface object specified by \p surfObject.
+ *
+ * \param surfObject - Surface object to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaCreateSurfaceObject,
+ * ::cuSurfObjectDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject);
+
+/**
+ * \brief Returns a surface object's resource descriptor
+ *
+ * Returns the resource descriptor for the surface object specified by \p surfObject.
+ *
+ * \param pResDesc   - Resource descriptor
+ * \param surfObject - Surface object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaCreateSurfaceObject,
+ * ::cuSurfObjectGetResourceDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject);
+
+/** @} */ /* END CUDART_SURFACE_OBJECT */
+
+/**
+ * \defgroup CUDART__VERSION Version Management
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the latest version of CUDA supported by the driver
+ *
+ * Returns in \p *driverVersion the latest version of CUDA supported by
+ * the driver. The version is returned as (1000 &times; major + 10 &times; minor).
+ * For example, CUDA 9.2 would be represented by 9020. If no driver is installed,
+ * then 0 is returned as the driver version.
+ *
+ * This function automatically returns ::cudaErrorInvalidValue
+ * if \p driverVersion is NULL.
+ *
+ * \param driverVersion - Returns the CUDA driver version.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaRuntimeGetVersion,
+ * ::cuDriverGetVersion
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion);
+
+/**
+ * \brief Returns the CUDA Runtime version
+ *
+ * Returns in \p *runtimeVersion the version number of the current CUDA
+ * Runtime instance. The version is returned as
+ * (1000 &times; major + 10 &times; minor). For example,
+ * CUDA 9.2 would be represented by 9020.
+ *
+ * As of CUDA 12.0, this function no longer initializes CUDA. The purpose
+ * of this API is solely to return a compile-time constant stating the
+ * CUDA Toolkit version in the above format.
+ *
+ * This function automatically returns ::cudaErrorInvalidValue if
+ * the \p runtimeVersion argument is NULL.
+ *
+ * \param runtimeVersion - Returns the CUDA Runtime version.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDriverGetVersion,
+ * ::cuDriverGetVersion
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
+
+/** @} */ /* END CUDART__VERSION */
+
+/**
+ * \defgroup CUDART_GRAPH Graph Management
+ *
+ * ___MANBRIEF___ graph management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the graph management functions of CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a graph
+ *
+ * Creates an empty graph, which is returned via \p pGraph.
+ *
+ * \param pGraph - Returns newly created graph
+ * \param flags   - Graph creation flags, must be 0
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode,
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphDestroy,
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphClone
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphCreate(cudaGraph_t *pGraph, unsigned int flags);
+
+/**
+ * \brief Creates a kernel execution node and adds it to a graph
+ *
+ * Creates a new kernel execution node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and arguments specified in \p pNodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * The cudaKernelNodeParams structure is defined as:
+ *
+ * \code
+ *  struct cudaKernelNodeParams
+ *  {
+ *      void* func;
+ *      dim3 gridDim;
+ *      dim3 blockDim;
+ *      unsigned int sharedMemBytes;
+ *      void **kernelParams;
+ *      void **extra;
+ *  };
+ * \endcode
+ *
+ * When the graph is launched, the node will invoke kernel \p func on a (\p gridDim.x x
+ * \p gridDim.y x \p gridDim.z) grid of blocks. Each block contains
+ * (\p blockDim.x x \p blockDim.y x \p blockDim.z) threads.
+ *
+ * \p sharedMemBytes sets the amount of dynamic shared memory that will be
+ * available to each thread block.
+ *
+ * Kernel parameters to \p func can be specified in one of two ways:
+ *
+ * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N
+ * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer,
+ * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual
+ * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need
+ * to be specified as that information is retrieved directly from the kernel's image.
+ *
+ * 2) Kernel parameters can also be packaged by the application into a single buffer that is passed in
+ * via \p extra. This places the burden on the application of knowing each kernel
+ * parameter's size and alignment/padding within the buffer. The \p extra parameter exists
+ * to allow this function to take additional less commonly used arguments. \p extra specifies
+ * a list of names of extra settings and their corresponding values. Each extra setting name is
+ * immediately followed by the corresponding value. The list must be terminated with either NULL or
+ * CU_LAUNCH_PARAM_END.
+ *
+ * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra
+ *   array;
+ * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next
+ *   value in \p extra will be a pointer to a buffer
+ *   containing all the kernel parameters for launching kernel
+ *   \p func;
+ * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next
+ *   value in \p extra will be a pointer to a size_t
+ *   containing the size of the buffer specified with
+ *   ::CU_LAUNCH_PARAM_BUFFER_POINTER;
+ *
+ * The error ::cudaErrorInvalidValue will be returned if kernel parameters are specified with both
+ * \p kernelParams and \p extra (i.e. both \p kernelParams and
+ * \p extra are non-NULL).
+ *
+ * The \p kernelParams or \p extra array, as well as the argument values it points to,
+ * are copied during this call.
+ *
+ * \note Kernels launched using graphs must not use texture and surface references. Reading or
+ *       writing through any texture or surface reference is undefined behavior.
+ *       This restriction does not apply to texture and surface objects.
+ *
+ * \param pGraphNode     - Returns newly created node
+ * \param graph          - Graph to which to add the node
+ * \param pDependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param pNodeParams      - Parameters for the GPU execution node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDeviceFunction
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaLaunchKernel,
+ * ::cudaGraphKernelNodeGetParams,
+ * ::cudaGraphKernelNodeSetParams,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddKernelNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaKernelNodeParams *pNodeParams);
+
+/**
+ * \brief Returns a kernel node's parameters
+ *
+ * Returns the parameters of kernel node \p node in \p pNodeParams.
+ * The \p kernelParams or \p extra array returned in \p pNodeParams,
+ * as well as the argument values it points to, are owned by the node.
+ * This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cudaGraphKernelNodeSetParams to update the
+ * parameters of this node.
+ *
+ * The params will contain either \p kernelParams or \p extra,
+ * according to which of these was most recently set on the node.
+ *
+ * \param node        - Node to get the parameters for
+ * \param pNodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDeviceFunction
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaLaunchKernel,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphKernelNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetParams(cudaGraphNode_t node, struct cudaKernelNodeParams *pNodeParams);
+
+/**
+ * \brief Sets a kernel node's parameters
+ *
+ * Sets the parameters of kernel node \p node to \p pNodeParams.
+ *
+ * \param node        - Node to set the parameters for
+ * \param pNodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorMemoryAllocation
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetParams,
+ * ::cudaLaunchKernel,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphKernelNodeGetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParams(cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams);
+
+/**
+ * \brief Copies attributes from source node to destination node.
+ *
+ * Copies attributes from source node \p hSrc to destination node \p hDst.
+ * Both nodes must have the same context.
+ * For the list of attributes see ::cudaKernelNodeAttrID.
+ *
+ * \note NOTE(review): the parameter roles documented here are taken from the
+ *       names in the prototype below (\p hSrc first, \p hDst second). Verify
+ *       the actual argument order against the implementation before relying
+ *       on it.
+ *
+ * \param[in] hSrc  - Source node
+ * \param[out] hDst - Destination node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidContext
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeCopyAttributes(
+        cudaGraphNode_t hSrc,
+        cudaGraphNode_t hDst);
+
+/**
+ * \brief Queries node attribute.
+ *
+ * Queries attribute \p attr from node \p hNode and stores it in corresponding
+ * member of \p value_out.
+ *
+ * \param[in] hNode      - Node to query the attribute from
+ * \param[in] attr       - Attribute to query
+ * \param[out] value_out - Receives the attribute in its corresponding member
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetAttribute(
+    cudaGraphNode_t hNode,
+    cudaKernelNodeAttrID attr,
+    cudaKernelNodeAttrValue *value_out);
+
+/**
+ * \brief Sets node attribute.
+ *
+ * Sets attribute \p attr on node \p hNode from corresponding attribute of
+ * \p value.
+ *
+ * \param[in] hNode - Node on which to set the attribute
+ * \param[in] attr  - Attribute to set
+ * \param[in] value - Source of the attribute value (declared const, only read)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetAttribute(
+    cudaGraphNode_t hNode,
+    cudaKernelNodeAttrID attr,
+    const cudaKernelNodeAttrValue *value);
+
+/**
+ * \brief Creates a memcpy node and adds it to a graph
+ *
+ * Creates a new memcpy node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will perform the memcpy described by \p pCopyParams.
+ * See ::cudaMemcpy3D() for a description of the structure and its restrictions.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode     - Returns newly created node
+ * \param graph          - Graph to which to add the node
+ * \param pDependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param pCopyParams      - Parameters for the memory copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaMemcpy3D,
+ * ::cudaGraphAddMemcpyNodeToSymbol,
+ * ::cudaGraphAddMemcpyNodeFromSymbol,
+ * ::cudaGraphAddMemcpyNode1D,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaMemcpy3DParms *pCopyParams);
+
+/**
+ * \brief Creates a memcpy node to copy to a symbol on the device and adds it to a graph
+ *
+ * Creates a new memcpy node to copy to \p symbol and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p src to the memory area pointed to by \p offset bytes from the start
+ * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyToSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeFromSymbol,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNodeToSymbol(
+    cudaGraphNode_t *pGraphNode,
+    cudaGraph_t graph,
+    const cudaGraphNode_t *pDependencies,
+    size_t numDependencies,
+    const void* symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Creates a memcpy node to copy from a symbol on the device and adds it to a graph
+ *
+ * Creates a new memcpy node to copy from \p symbol and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p offset bytes from the start of symbol \p symbol to the memory area
+ * pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable
+ * that resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyFromSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeToSymbol,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNodeFromSymbol(
+    cudaGraphNode_t* pGraphNode,
+    cudaGraph_t graph,
+    const cudaGraphNode_t* pDependencies,
+    size_t numDependencies,
+    void* dst,
+    const void* symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Creates a 1D memcpy node and adds it to a graph
+ *
+ * Creates a new 1D memcpy node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory
+ * area pointed to by \p src to the memory area pointed to by \p dst, where
+ * \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing. Launching a
+ * memcpy node with dst and src pointers that do not match the direction of
+ * the copy results in undefined behavior.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param dst             - Destination memory address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpy,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParams1D,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode1D(
+    cudaGraphNode_t *pGraphNode,
+    cudaGraph_t graph,
+    const cudaGraphNode_t *pDependencies,
+    size_t numDependencies,
+    void* dst,
+    const void* src,
+    size_t count,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Returns a memcpy node's parameters
+ *
+ * Returns the parameters of memcpy node \p node in \p pNodeParams.
+ *
+ * \param node        - Node to get the parameters for
+ * \param pNodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpy3D,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeGetParams(cudaGraphNode_t node, struct cudaMemcpy3DParms *pNodeParams);
+
+/**
+ * \brief Sets a memcpy node's parameters
+ *
+ * Sets the parameters of memcpy node \p node to \p pNodeParams.
+ *
+ * \param node        - Node to set the parameters for
+ * \param pNodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetParams,
+ * ::cudaMemcpy3D,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParams1D,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams(cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams);
+
+/**
+ * \brief Sets a memcpy node's parameters to copy to a symbol on the device
+ *
+ * Sets the parameters of memcpy node \p node to the copy described by the provided parameters.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p src to the memory area pointed to by \p offset bytes from the start
+ * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * \param node            - Node to set the parameters for
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyToSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParamsToSymbol(
+    cudaGraphNode_t node,
+    const void* symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Sets a memcpy node's parameters to copy from a symbol on the device
+ *
+ * Sets the parameters of memcpy node \p node to the copy described by the provided parameters.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p offset bytes from the start of symbol \p symbol to the memory area
+ * pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable
+ * that resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * \param node            - Node to set the parameters for
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParamsFromSymbol(
+    cudaGraphNode_t node,
+    void* dst,
+    const void* symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Sets a memcpy node's parameters to perform a 1-dimensional copy
+ *
+ * Sets the parameters of memcpy node \p node to the copy described by the provided parameters.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory
+ * area pointed to by \p src to the memory area pointed to by \p dst, where
+ * \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing. Launching a
+ * memcpy node with dst and src pointers that do not match the direction of
+ * the copy results in undefined behavior.
+ *
+ * \param node            - Node to set the parameters for
+ * \param dst             - Destination memory address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpy,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams1D(
+    cudaGraphNode_t node,
+    void* dst,
+    const void* src,
+    size_t count,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Creates a memset node and adds it to a graph
+ *
+ * Creates a new memset node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * The element size must be 1, 2, or 4 bytes.
+ * When the graph is launched, the node will perform the memset described by \p pMemsetParams.
+ *
+ * \param pGraphNode     - Returns newly created node
+ * \param graph          - Graph to which to add the node
+ * \param pDependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param pMemsetParams    - Parameters for the memory set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaMemset2D,
+ * ::cudaGraphMemsetNodeGetParams,
+ * ::cudaGraphMemsetNodeSetParams,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemsetNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaMemsetParams *pMemsetParams);
+
+/**
+ * \brief Returns a memset node's parameters
+ *
+ * Returns the parameters of memset node \p node in \p pNodeParams.
+ *
+ * \param node        - Node to get the parameters for
+ * \param pNodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemset2D,
+ * ::cudaGraphAddMemsetNode,
+ * ::cudaGraphMemsetNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeGetParams(cudaGraphNode_t node, struct cudaMemsetParams *pNodeParams);
+
+/**
+ * \brief Sets a memset node's parameters
+ *
+ * Sets the parameters of memset node \p node to \p pNodeParams.
+ *
+ * \param node        - Node to set the parameters for
+ * \param pNodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetParams,
+ * ::cudaMemset2D,
+ * ::cudaGraphAddMemsetNode,
+ * ::cudaGraphMemsetNodeGetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeSetParams(cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams);
+
+/**
+ * \brief Creates a host execution node and adds it to a graph
+ *
+ * Creates a new CPU execution node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and arguments specified in \p pNodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will invoke the specified CPU function.
+ * Host nodes are not supported under MPS with pre-Volta GPUs.
+ *
+ * \param pGraphNode     - Returns newly created node
+ * \param graph          - Graph to which to add the node
+ * \param pDependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param pNodeParams      - Parameters for the host node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaLaunchHostFunc,
+ * ::cudaGraphHostNodeGetParams,
+ * ::cudaGraphHostNodeSetParams,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddHostNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaHostNodeParams *pNodeParams);
+
+/**
+ * \brief Returns a host node's parameters
+ *
+ * Returns the parameters of host node \p node in \p pNodeParams.
+ *
+ * \param node        - Node to get the parameters for
+ * \param pNodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaLaunchHostFunc,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphHostNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeGetParams(cudaGraphNode_t node, struct cudaHostNodeParams *pNodeParams);
+
+/**
+ * \brief Sets a host node's parameters
+ *
+ * Sets the parameters of host node \p node to \p pNodeParams.
+ *
+ * \param node        - Node to set the parameters for
+ * \param pNodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetParams,
+ * ::cudaLaunchHostFunc,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphHostNodeGetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeSetParams(cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams);
+
+/**
+ * \brief Creates a child graph node and adds it to a graph
+ *
+ * Creates a new node which executes an embedded graph, and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * If \p childGraph contains allocation or free nodes, this call will return an error.
+ *
+ * The node executes an embedded child graph. The child graph is cloned in this call.
+ *
+ * \param pGraphNode     - Returns newly created node
+ * \param graph          - Graph to which to add the node
+ * \param pDependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param childGraph      - The graph to clone into this node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphChildGraphNodeGetGraph,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode,
+ * ::cudaGraphClone
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddChildGraphNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaGraph_t childGraph);
+
+/**
+ * \brief Gets a handle to the embedded graph of a child graph node
+ *
+ * Gets a handle to the embedded graph in a child graph node. This call
+ * does not clone the graph. Changes to the graph will be reflected in
+ * the node, and the node retains ownership of the graph.
+ *
+ * Allocation and free nodes cannot be added to the returned graph.
+ * Attempting to do so will return an error.
+ *
+ * \param node   - Node to get the embedded graph for
+ * \param pGraph - Location to store a handle to the graph
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphNodeFindInClone
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t *pGraph);
+
+/**
+ * \brief Creates an empty node and adds it to a graph
+ *
+ * Creates a new node which performs no operation, and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * An empty node performs no operation during execution, but can be used for
+ * transitive ordering. For example, a phased execution graph with 2 groups of n
+ * nodes with a barrier between them can be represented using an empty node and
+ * 2*n dependency edges, rather than no empty node and n^2 dependency edges.
+ *
+ * \param pGraphNode     - Returns newly created node
+ * \param graph          - Graph to which to add the node
+ * \param pDependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddEmptyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies);
+
+/**
+ * \brief Creates an event record node and adds it to a graph
+ *
+ * Creates a new event record node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and event specified in \p event.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * Each launch of the graph will record \p event to capture execution of the
+ * node's dependencies.
+ *
+ * These nodes may not be used in loops or conditionals.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param event           - Event for the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphAddEventRecordNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaEvent_t event);
+#endif
+
+/**
+ * \brief Returns the event associated with an event record node
+ *
+ * Returns the event of event record node \p node in \p event_out.
+ *
+ * \param node      - Node to get the event for
+ * \param event_out - Pointer to return the event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphEventRecordNodeSetEvent,
+ * ::cudaGraphEventWaitNodeGetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphEventRecordNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out);
+#endif
+
+/**
+ * \brief Sets an event record node's event
+ *
+ * Sets the event of event record node \p node to \p event.
+ *
+ * \param node  - Node to set the event for
+ * \param event - Event to use
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetParams,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphEventRecordNodeGetEvent,
+ * ::cudaGraphEventWaitNodeSetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphEventRecordNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event);
+#endif
+
+/**
+ * \brief Creates an event wait node and adds it to a graph
+ *
+ * Creates a new event wait node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and event specified in \p event.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * The graph node will wait for all work captured in \p event.  See ::cudaEventRecord()
+ * for details on what is captured by an event.  The synchronization will be performed
+ * efficiently on the device when applicable.  \p event may be from a different context
+ * or device than the launch stream.
+ *
+ * These nodes may not be used in loops or conditionals.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param event           - Event for the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphAddEventWaitNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaEvent_t event);
+#endif
+
+/**
+ * \brief Returns the event associated with an event wait node
+ *
+ * Returns the event of event wait node \p node in \p event_out.
+ *
+ * \param node      - Node to get the event for
+ * \param event_out - Pointer to return the event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphEventWaitNodeSetEvent,
+ * ::cudaGraphEventRecordNodeGetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphEventWaitNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out);
+#endif
+
+/**
+ * \brief Sets an event wait node's event
+ *
+ * Sets the event of event wait node \p node to \p event.
+ *
+ * \param node  - Node to set the event for
+ * \param event - Event to use
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetParams,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphEventWaitNodeGetEvent,
+ * ::cudaGraphEventRecordNodeSetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphEventWaitNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event);
+#endif
+
+/**
+ * \brief Creates an external semaphore signal node and adds it to a graph
+ *
+ * Creates a new external semaphore signal node and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies and arguments specified
+ * in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the
+ * node will be placed at the root of the graph. \p pDependencies may not have any
+ * duplicate entries. A handle to the new node will be returned in \p pGraphNode.
+ *
+ * Performs a signal operation on a set of externally allocated semaphore objects
+ * when the node is launched.  The operation(s) will occur after all of the node's
+ * dependencies have completed.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphExternalSemaphoresSignalNodeGetParams,
+ * ::cudaGraphExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaImportExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddExternalSemaphoresSignalNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaExternalSemaphoreSignalNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Returns an external semaphore signal node's parameters
+ *
+ * Returns the parameters of an external semaphore signal node \p hNode in \p params_out.
+ * The \p extSemArray and \p paramsArray returned in \p params_out,
+ * are owned by the node.  This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cudaGraphExternalSemaphoresSignalNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaLaunchKernel,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaGraphExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExternalSemaphoresSignalNodeGetParams(cudaGraphNode_t hNode, struct cudaExternalSemaphoreSignalNodeParams *params_out);
+#endif
+
+/**
+ * \brief Sets an external semaphore signal node's parameters
+ *
+ * Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaGraphExternalSemaphoresSignalNodeGetParams,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExternalSemaphoresSignalNodeSetParams(cudaGraphNode_t hNode, const struct cudaExternalSemaphoreSignalNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Creates an external semaphore wait node and adds it to a graph
+ *
+ * Creates a new external semaphore wait node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p pGraphNode.
+ *
+ * Performs a wait operation on a set of externally allocated semaphore objects
+ * when the node is launched.  The node's dependent nodes will not be launched until
+ * the wait operation has completed.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphExternalSemaphoresWaitNodeGetParams,
+ * ::cudaGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaImportExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddExternalSemaphoresWaitNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaExternalSemaphoreWaitNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Returns an external semaphore wait node's parameters
+ *
+ * Returns the parameters of an external semaphore wait node \p hNode in \p params_out.
+ * The \p extSemArray and \p paramsArray returned in \p params_out
+ * are owned by the node.  This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cudaGraphExternalSemaphoresWaitNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaLaunchKernel,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExternalSemaphoresWaitNodeGetParams(cudaGraphNode_t hNode, struct cudaExternalSemaphoreWaitNodeParams *params_out);
+#endif
+
+/**
+ * \brief Sets an external semaphore wait node's parameters
+ *
+ * Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaGraphExternalSemaphoresWaitNodeGetParams,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExternalSemaphoresWaitNodeSetParams(cudaGraphNode_t hNode, const struct cudaExternalSemaphoreWaitNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Creates an allocation node and adds it to a graph
+ *
+ * Creates a new allocation node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p pGraphNode.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * When ::cudaGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in
+ * \p nodeParams.dptr.  The allocation's address remains fixed across instantiations and launches.
+ *
+ * If the allocation is freed in the same graph, by creating a free node using ::cudaGraphAddMemFreeNode,
+ * the allocation can be accessed by nodes ordered after the allocation node but before the free node.
+ * These allocations cannot be freed outside the owning graph, and they can only be freed once in the
+ * owning graph.
+ *
+ * If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the
+ * graph which are ordered after the allocation node, but also by stream operations ordered after the
+ * graph's execution but before the allocation is freed.
+ *
+ * Allocations which are not freed in the same graph can be freed by:
+ * - passing the allocation to ::cudaFreeAsync or ::cudaFree;
+ * - launching a graph with a free node for that allocation; or
+ * - specifying ::cudaGraphInstantiateFlagAutoFreeOnLaunch during instantiation, which makes
+ *   each launch behave as though it called ::cudaFreeAsync for every unfreed allocation.
+ *
+ * It is not possible to free an allocation in both the owning graph and another graph.  If the allocation
+ * is freed in the same graph, a free node cannot be added to another graph.  If the allocation is freed
+ * in another graph, a free node can no longer be added to the owning graph.
+ *
+ * The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ * - Nodes and edges of the graph cannot be deleted.
+ * - The graph cannot be used in a child node.
+ * - Only one instantiation of the graph may exist at any point in time.
+ * - The graph cannot be cloned.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOutOfMemory
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphAddMemFreeNode,
+ * ::cudaGraphMemAllocNodeGetParams,
+ * ::cudaDeviceGraphMemTrim,
+ * ::cudaDeviceGetGraphMemAttribute,
+ * ::cudaDeviceSetGraphMemAttribute,
+ * ::cudaMallocAsync,
+ * ::cudaFreeAsync,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemAllocNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, struct cudaMemAllocNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Returns a memory alloc node's parameters
+ *
+ * Returns the parameters of a memory alloc node \p node in \p params_out.
+ * The \p poolProps and \p accessDescs returned in \p params_out are owned by the
+ * node.  This memory remains valid until the node is destroyed.  The returned
+ * parameters must not be modified.
+ *
+ * \param node       - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemAllocNode,
+ * ::cudaGraphMemFreeNodeGetParams
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemAllocNodeGetParams(cudaGraphNode_t node, struct cudaMemAllocNodeParams *params_out);
+#endif
+
+/**
+ * \brief Creates a memory free node and adds it to a graph
+ *
+ * Creates a new memory free node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and address specified in \p dptr.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p pGraphNode.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param dptr            - Address of memory to free
+ *
+ * ::cudaGraphAddMemFreeNode will return ::cudaErrorInvalidValue if the user attempts to free:
+ * - an allocation twice in the same graph.
+ * - an address that was not returned by an allocation node.
+ * - an invalid address.
+ *
+ * The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ * - Nodes and edges of the graph cannot be deleted.
+ * - The graph cannot be used in a child node.
+ * - Only one instantiation of the graph may exist at any point in time.
+ * - The graph cannot be cloned.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOutOfMemory
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphAddMemAllocNode,
+ * ::cudaGraphMemFreeNodeGetParams,
+ * ::cudaDeviceGraphMemTrim,
+ * ::cudaDeviceGetGraphMemAttribute,
+ * ::cudaDeviceSetGraphMemAttribute,
+ * ::cudaMallocAsync,
+ * ::cudaFreeAsync,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemFreeNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, void *dptr);
+#endif
+
+/**
+ * \brief Returns a memory free node's parameters
+ *
+ * Returns the address of a memory free node \p node in \p dptr_out.
+ *
+ * \param node     - Node to get the parameters for
+ * \param dptr_out - Pointer to return the device address
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemFreeNode,
+ * ::cudaGraphMemAllocNodeGetParams
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemFreeNodeGetParams(cudaGraphNode_t node, void *dptr_out);
+#endif
+
+/**
+ * \brief Free unused memory that was cached on the specified device for use with graphs back to the OS.
+ *
+ * Blocks which are not in use by a graph that is either currently executing or scheduled to execute are
+ * freed back to the operating system.
+ *
+ * \param device - The device for which cached memory should be freed.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemAllocNode,
+ * ::cudaGraphAddMemFreeNode,
+ * ::cudaDeviceGetGraphMemAttribute,
+ * ::cudaDeviceSetGraphMemAttribute,
+ * ::cudaMallocAsync,
+ * ::cudaFreeAsync
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGraphMemTrim(int device);
+#endif
+
+/**
+ * \brief Query asynchronous allocation attributes related to graphs
+ *
+ * Valid attributes are:
+ *
+ * - ::cudaGraphMemAttrUsedMemCurrent: Amount of memory, in bytes, currently associated with graphs
+ * - ::cudaGraphMemAttrUsedMemHigh: High watermark of memory, in bytes, associated with graphs since the
+ *   last time it was reset.  High watermark can only be reset to zero.
+ * - ::cudaGraphMemAttrReservedMemCurrent: Amount of memory, in bytes, currently allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ * - ::cudaGraphMemAttrReservedMemHigh: High watermark of memory, in bytes, currently allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ *
+ * \param device - Specifies the scope of the query
+ * \param attr - attribute to get
+ * \param value - retrieved value
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceSetGraphMemAttribute,
+ * ::cudaGraphAddMemAllocNode,
+ * ::cudaGraphAddMemFreeNode,
+ * ::cudaDeviceGraphMemTrim,
+ * ::cudaMallocAsync,
+ * ::cudaFreeAsync
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetGraphMemAttribute(int device, enum cudaGraphMemAttributeType attr, void* value);
+#endif
+
+/**
+ * \brief Set asynchronous allocation attributes related to graphs
+ *
+ * Valid attributes are:
+ *
+ * - ::cudaGraphMemAttrUsedMemHigh: High watermark of memory, in bytes, associated with graphs since the
+ *   last time it was reset.  High watermark can only be reset to zero.
+ * - ::cudaGraphMemAttrReservedMemHigh: High watermark of memory, in bytes, currently allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ *
+ * \param device - Specifies the device whose attribute is set
+ * \param attr   - attribute to set
+ * \param value  - pointer to value to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceGetGraphMemAttribute,
+ * ::cudaGraphAddMemAllocNode,
+ * ::cudaGraphAddMemFreeNode,
+ * ::cudaDeviceGraphMemTrim,
+ * ::cudaMallocAsync,
+ * ::cudaFreeAsync
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetGraphMemAttribute(int device, enum cudaGraphMemAttributeType attr, void* value);
+#endif
+
+/**
+ * \brief Clones a graph
+ *
+ * This function creates a copy of \p originalGraph and returns it in \p pGraphClone.
+ * All parameters are copied into the cloned graph. The original graph may be modified 
+ * after this call without affecting the clone.
+ *
+ * Child graph nodes in the original graph are recursively copied into the clone.
+ *
+ * \param pGraphClone  - Returns newly created cloned graph
+ * \param originalGraph - Graph to clone
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphCreate,
+ * ::cudaGraphNodeFindInClone
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphClone(cudaGraph_t *pGraphClone, cudaGraph_t originalGraph);
+
+/**
+ * \brief Finds a cloned version of a node
+ *
+ * This function returns the node in \p clonedGraph corresponding to \p originalNode
+ * in the original graph.
+ *
+ * \p clonedGraph must have been cloned from the original graph via ::cudaGraphClone.
+ * \p originalNode must have been in the original graph at the time of the call to
+ * ::cudaGraphClone, and the corresponding cloned node in \p clonedGraph must not have
+ * been removed. The cloned node is then returned via \p pNode.
+ *
+ * \param pNode        - Returns handle to the cloned node
+ * \param originalNode - Handle to the original node
+ * \param clonedGraph  - Cloned graph to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphClone
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeFindInClone(cudaGraphNode_t *pNode, cudaGraphNode_t originalNode, cudaGraph_t clonedGraph);
+
+/**
+ * \brief Returns a node's type
+ *
+ * Returns the node type of \p node in \p pType.
+ *
+ * \param node - Node to query
+ * \param pType  - Pointer to return the node type
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphChildGraphNodeGetGraph,
+ * ::cudaGraphKernelNodeGetParams,
+ * ::cudaGraphKernelNodeSetParams,
+ * ::cudaGraphHostNodeGetParams,
+ * ::cudaGraphHostNodeSetParams,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemsetNodeGetParams,
+ * ::cudaGraphMemsetNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType *pType);
+
+/**
+ * \brief Returns a graph's nodes
+ *
+ * Returns a list of \p graph's nodes. \p nodes may be NULL, in which case this
+ * function will return the number of nodes in \p numNodes. Otherwise,
+ * \p numNodes entries will be filled in. If \p numNodes is higher than the actual
+ * number of nodes, the remaining entries in \p nodes will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p numNodes.
+ *
+ * \param graph    - Graph to query
+ * \param nodes    - Pointer to return the nodes
+ * \param numNodes - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphCreate,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphNodeGetType,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphGetNodes(cudaGraph_t graph, cudaGraphNode_t *nodes, size_t *numNodes);
+
+/**
+ * \brief Returns a graph's root nodes
+ *
+ * Returns a list of \p graph's root nodes. \p pRootNodes may be NULL, in which case this
+ * function will return the number of root nodes in \p pNumRootNodes. Otherwise,
+ * \p pNumRootNodes entries will be filled in. If \p pNumRootNodes is higher than the actual
+ * number of root nodes, the remaining entries in \p pRootNodes will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p pNumRootNodes.
+ *
+ * \param graph       - Graph to query
+ * \param pRootNodes    - Pointer to return the root nodes
+ * \param pNumRootNodes - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphCreate,
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphNodeGetType,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphGetRootNodes(cudaGraph_t graph, cudaGraphNode_t *pRootNodes, size_t *pNumRootNodes);
+
+/**
+ * \brief Returns a graph's dependency edges
+ *
+ * Returns a list of \p graph's dependency edges. Edges are returned via corresponding
+ * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the
+ * node in \p from[i]. \p from and \p to may both be NULL, in which
+ * case this function only returns the number of edges in \p numEdges. Otherwise,
+ * \p numEdges entries will be filled in. If \p numEdges is higher than the actual
+ * number of edges, the remaining entries in \p from and \p to will be set to NULL, and
+ * the number of edges actually returned will be written to \p numEdges.
+ *
+ * \param graph    - Graph to get the edges from
+ * \param from     - Location to return edge endpoints
+ * \param to       - Location to return edge endpoints
+ * \param numEdges - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphRemoveDependencies,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t *from, cudaGraphNode_t *to, size_t *numEdges);
+
+/**
+ * \brief Returns a graph's dependency edges (12.3+)
+ *
+ * Returns a list of \p graph's dependency edges. Edges are returned via corresponding
+ * indices in \p from, \p to and \p edgeData; that is, the node in \p to[i] has a
+ * dependency on the node in \p from[i] with data \p edgeData[i]. \p from and \p to may
+ * both be NULL, in which case this function only returns the number of edges in
+ * \p numEdges. Otherwise, \p numEdges entries will be filled in. If \p numEdges is higher
+ * than the actual number of edges, the remaining entries in \p from and \p to will be
+ * set to NULL, and the number of edges actually returned will be written to \p numEdges.
+ * \p edgeData may alone be NULL, in which case the edges must all have default (zeroed)
+ * edge data. Attempting a lossy query via NULL \p edgeData will result in
+ * ::cudaErrorLossyQuery. If \p edgeData is non-NULL then \p from and \p to must be as
+ * well.
+ *
+ * \param graph    - Graph to get the edges from
+ * \param from     - Location to return edge endpoints
+ * \param to       - Location to return edge endpoints
+ * \param edgeData - Optional location to return edge data
+ * \param numEdges - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorLossyQuery,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphRemoveDependencies,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges_v2(cudaGraph_t graph, cudaGraphNode_t *from, cudaGraphNode_t *to, cudaGraphEdgeData *edgeData, size_t *numEdges);
+
+/**
+ * \brief Returns a node's dependencies
+ *
+ * Returns a list of \p node's dependencies. \p pDependencies may be NULL, in which case this
+ * function will return the number of dependencies in \p pNumDependencies. Otherwise,
+ * \p pNumDependencies entries will be filled in. If \p pNumDependencies is higher than the actual
+ * number of dependencies, the remaining entries in \p pDependencies will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p pNumDependencies.
+ *
+ * \param node           - Node to query
+ * \param pDependencies    - Pointer to return the dependencies
+ * \param pNumDependencies - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeGetDependentNodes,
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphRemoveDependencies
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t *pDependencies, size_t *pNumDependencies);
+
+/**
+ * \brief Returns a node's dependencies (12.3+)
+ *
+ * Returns a list of \p node's dependencies. \p pDependencies may be NULL, in which case this
+ * function will return the number of dependencies in \p pNumDependencies. Otherwise,
+ * \p pNumDependencies entries will be filled in. If \p pNumDependencies is higher than the actual
+ * number of dependencies, the remaining entries in \p pDependencies will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p pNumDependencies.
+ *
+ * Note that if an edge has non-zero (non-default) edge data and \p edgeData is NULL,
+ * this API will return ::cudaErrorLossyQuery. If \p edgeData is non-NULL, then
+ * \p pDependencies must be as well.
+ *
+ * \param node             - Node to query
+ * \param pDependencies    - Pointer to return the dependencies
+ * \param edgeData         - Optional array to return edge data for each dependency
+ * \param pNumDependencies - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorLossyQuery,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeGetDependentNodes,
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphRemoveDependencies
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies_v2(cudaGraphNode_t node, cudaGraphNode_t *pDependencies, cudaGraphEdgeData *edgeData, size_t *pNumDependencies);
+
+/**
+ * \brief Returns a node's dependent nodes
+ *
+ * Returns a list of \p node's dependent nodes. \p pDependentNodes may be NULL, in which
+ * case this function will return the number of dependent nodes in \p pNumDependentNodes.
+ * Otherwise, \p pNumDependentNodes entries will be filled in. If \p pNumDependentNodes is
+ * higher than the actual number of dependent nodes, the remaining entries in
+ * \p pDependentNodes will be set to NULL, and the number of nodes actually obtained will
+ * be returned in \p pNumDependentNodes.
+ *
+ * \param node             - Node to query
+ * \param pDependentNodes    - Pointer to return the dependent nodes
+ * \param pNumDependentNodes - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphRemoveDependencies
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes, size_t *pNumDependentNodes);
+
+/**
+ * \brief Returns a node's dependent nodes (12.3+)
+ *
+ * Returns a list of \p node's dependent nodes. \p pDependentNodes may be NULL, in which
+ * case this function will return the number of dependent nodes in \p pNumDependentNodes.
+ * Otherwise, \p pNumDependentNodes entries will be filled in. If \p pNumDependentNodes is
+ * higher than the actual number of dependent nodes, the remaining entries in
+ * \p pDependentNodes will be set to NULL, and the number of nodes actually obtained will
+ * be returned in \p pNumDependentNodes.
+ *
+ * Note that if an edge has non-zero (non-default) edge data and \p edgeData is NULL,
+ * this API will return ::cudaErrorLossyQuery. If \p edgeData is non-NULL, then
+ * \p pDependentNodes must be as well.
+ *
+ * \param node               - Node to query
+ * \param pDependentNodes    - Pointer to return the dependent nodes
+ * \param edgeData           - Optional pointer to return edge data for dependent nodes
+ * \param pNumDependentNodes - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorLossyQuery,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphRemoveDependencies
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes_v2(cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes, cudaGraphEdgeData *edgeData, size_t *pNumDependentNodes);
+
+/**
+ * \brief Adds dependency edges to a graph.
+ *
+ * The number of dependencies to be added is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p graph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying an existing dependency will return an error.
+ *
+ * \param graph - Graph to which dependencies are added
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param numDependencies - Number of dependencies to be added
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphRemoveDependencies,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies);
+
+/**
+ * \brief Adds dependency edges to a graph. (12.3+)
+ *
+ * The number of dependencies to be added is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p graph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying an existing dependency will return an error.
+ *
+ * \param graph - Graph to which dependencies are added
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param edgeData - Optional array of edge data. If NULL, default (zeroed) edge data is assumed.
+ * \param numDependencies - Number of dependencies to be added
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphRemoveDependencies,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, const cudaGraphEdgeData *edgeData, size_t numDependencies);
+
+/**
+ * \brief Removes dependency edges from a graph.
+ *
+ * The number of dependencies to be removed is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p graph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying a non-existing dependency will return an error.
+ *
+ * \param graph - Graph from which to remove dependencies
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param numDependencies - Number of dependencies to be removed
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies);
+
+/**
+ * \brief Removes dependency edges from a graph. (12.3+)
+ *
+ * The number of dependencies to be removed is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p graph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying an edge that does not exist in the graph, with data matching
+ * \p edgeData, results in an error. \p edgeData is nullable, which is equivalent
+ * to passing default (zeroed) data for each edge.
+ *
+ * \param graph - Graph from which to remove dependencies
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param edgeData - Optional array of edge data. If NULL, edge data is assumed to
+ *                   be default (zeroed).
+ * \param numDependencies - Number of dependencies to be removed
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphRemoveDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, const cudaGraphEdgeData *edgeData, size_t numDependencies);
+
+/**
+ * \brief Remove a node from the graph
+ *
+ * Removes \p node from its graph. This operation also severs any dependencies of other nodes 
+ * on \p node and vice versa.
+ *
+ * Dependencies cannot be removed from graphs which contain allocation or free nodes.
+ * Any attempt to do so will return an error.
+ *
+ * \param node  - Node to remove
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphDestroyNode(cudaGraphNode_t node);
+
+/**
+ * \brief Creates an executable graph from a graph
+ *
+ * Instantiates \p graph as an executable graph. The graph is validated for any
+ * structural constraints or intra-node constraints which were not previously
+ * validated. If instantiation is successful, a handle to the instantiated graph
+ * is returned in \p pGraphExec.
+ *
+ * The \p flags parameter controls the behavior of instantiation and subsequent
+ * graph launches.  Valid flags are:
+ *
+ * - ::cudaGraphInstantiateFlagAutoFreeOnLaunch, which configures a
+ * graph containing memory allocation nodes to automatically free any
+ * unfreed memory allocations before the graph is relaunched.
+ *
+ * - ::cudaGraphInstantiateFlagDeviceLaunch, which configures the graph for launch
+ * from the device. If this flag is passed, the executable graph handle returned can be
+ * used to launch the graph from both the host and device. This flag cannot be used in
+ * conjunction with ::cudaGraphInstantiateFlagAutoFreeOnLaunch.
+ *
+ * - ::cudaGraphInstantiateFlagUseNodePriority, which causes the graph
+ * to use the priorities from the per-node attributes rather than the priority
+ * of the launch stream during execution. Note that priorities are only available
+ * on kernel nodes, and are copied from stream priority during stream capture.
+ *
+ * If \p graph contains any allocation or free nodes, there can be at most one
+ * executable graph in existence for that graph at a time. An attempt to
+ * instantiate a second executable graph before destroying the first with
+ * ::cudaGraphExecDestroy will result in an error.
+ * The same also applies if \p graph contains any device-updatable kernel nodes.
+ * 
+ * Graphs instantiated for launch on the device have additional restrictions which do not
+ * apply to host graphs:
+ *
+ * - The graph's nodes must reside on a single device.
+ * - The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes.
+ * - The graph cannot be empty and must contain at least one kernel, memcpy, or memset node.
+ *   Operation-specific restrictions are outlined below.
+ * - Kernel nodes:
+ *   - Use of CUDA Dynamic Parallelism is not permitted.
+ *   - Cooperative launches are permitted as long as MPS is not in use.
+ * - Memcpy nodes:
+ *   - Only copies involving device memory and/or pinned device-mapped host memory are permitted.
+ *   - Copies involving CUDA arrays are not permitted.
+ *   - Both operands must be accessible from the current device, and the current device must
+ *     match the device of other nodes in the graph.
+ *
+ * If \p graph is not instantiated for launch on the device but contains kernels which
+ * call device-side cudaGraphLaunch() from multiple devices, this will result in an error.
+ *
+ * \param pGraphExec - Returns instantiated graph
+ * \param graph      - Graph to instantiate
+ * \param flags      - Flags to control instantiation.  See ::cudaGraphInstantiateFlags.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphInstantiateWithFlags,
+ * ::cudaGraphCreate,
+ * ::cudaGraphUpload,
+ * ::cudaGraphLaunch,
+ * ::cudaGraphExecDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiate(cudaGraphExec_t *pGraphExec, cudaGraph_t graph, unsigned long long flags __dv(0));
+
+/**
+ * \brief Creates an executable graph from a graph
+ *
+ * Instantiates \p graph as an executable graph. The graph is validated for any
+ * structural constraints or intra-node constraints which were not previously
+ * validated. If instantiation is successful, a handle to the instantiated graph
+ * is returned in \p pGraphExec.
+ *
+ * The \p flags parameter controls the behavior of instantiation and subsequent
+ * graph launches.  Valid flags are:
+ *
+ * - ::cudaGraphInstantiateFlagAutoFreeOnLaunch, which configures a
+ * graph containing memory allocation nodes to automatically free any
+ * unfreed memory allocations before the graph is relaunched.
+ *
+ * - ::cudaGraphInstantiateFlagDeviceLaunch, which configures the graph for launch
+ * from the device. If this flag is passed, the executable graph handle returned can be
+ * used to launch the graph from both the host and device. This flag can only be used
+ * on platforms which support unified addressing. This flag cannot be used in
+ * conjunction with ::cudaGraphInstantiateFlagAutoFreeOnLaunch.
+ *
+ * - ::cudaGraphInstantiateFlagUseNodePriority, which causes the graph
+ * to use the priorities from the per-node attributes rather than the priority
+ * of the launch stream during execution. Note that priorities are only available
+ * on kernel nodes, and are copied from stream priority during stream capture.
+ *
+ * If \p graph contains any allocation or free nodes, there can be at most one
+ * executable graph in existence for that graph at a time. An attempt to
+ * instantiate a second executable graph before destroying the first with
+ * ::cudaGraphExecDestroy will result in an error.
+ * The same also applies if \p graph contains any device-updatable kernel nodes.
+ *
+ * If \p graph contains kernels which call device-side cudaGraphLaunch() from multiple
+ * devices, this will result in an error.
+ *
+ * Graphs instantiated for launch on the device have additional restrictions which do not
+ * apply to host graphs:
+ *
+ * - The graph's nodes must reside on a single device.
+ * - The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes.
+ * - The graph cannot be empty and must contain at least one kernel, memcpy, or memset node.
+ *   Operation-specific restrictions are outlined below.
+ * - Kernel nodes:
+ *   - Use of CUDA Dynamic Parallelism is not permitted.
+ *   - Cooperative launches are permitted as long as MPS is not in use.
+ * - Memcpy nodes:
+ *   - Only copies involving device memory and/or pinned device-mapped host memory are permitted.
+ *   - Copies involving CUDA arrays are not permitted.
+ *   - Both operands must be accessible from the current device, and the current device must
+ *     match the device of other nodes in the graph.
+ *
+ * \param pGraphExec - Returns instantiated graph
+ * \param graph      - Graph to instantiate
+ * \param flags      - Flags to control instantiation.  See ::cudaGraphInstantiateFlags.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphCreate,
+ * ::cudaGraphUpload,
+ * ::cudaGraphLaunch,
+ * ::cudaGraphExecDestroy
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiateWithFlags(cudaGraphExec_t *pGraphExec, cudaGraph_t graph, unsigned long long flags __dv(0));
+#endif
+
+/**
+ * \brief Creates an executable graph from a graph
+ *
+ * Instantiates \p graph as an executable graph according to the \p instantiateParams structure.
+ * The graph is validated for any structural constraints or intra-node constraints
+ * which were not previously validated. If instantiation is successful, a handle to
+ * the instantiated graph is returned in \p pGraphExec.
+ *
+ * \p instantiateParams controls the behavior of instantiation and subsequent
+ * graph launches, as well as returning more detailed information in the event of an error.
+ * ::cudaGraphInstantiateParams is defined as:
+ *
+ * \code
+    typedef struct {
+        unsigned long long flags;
+        cudaStream_t uploadStream;
+        cudaGraphNode_t errNode_out;
+        cudaGraphInstantiateResult result_out;
+    } cudaGraphInstantiateParams;
+ * \endcode
+ *
+ * The \p flags field controls the behavior of instantiation and subsequent
+ * graph launches. Valid flags are:
+ *
+ * - ::cudaGraphInstantiateFlagAutoFreeOnLaunch, which configures a
+ * graph containing memory allocation nodes to automatically free any
+ * unfreed memory allocations before the graph is relaunched.
+ *
+ * - ::cudaGraphInstantiateFlagUpload, which will perform an upload of the graph
+ * into \p uploadStream once the graph has been instantiated.
+ *
+ * - ::cudaGraphInstantiateFlagDeviceLaunch, which configures the graph for launch
+ * from the device. If this flag is passed, the executable graph handle returned can be
+ * used to launch the graph from both the host and device. This flag can only be used
+ * on platforms which support unified addressing. This flag cannot be used in
+ * conjunction with ::cudaGraphInstantiateFlagAutoFreeOnLaunch.
+ *
+ * - ::cudaGraphInstantiateFlagUseNodePriority, which causes the graph
+ * to use the priorities from the per-node attributes rather than the priority
+ * of the launch stream during execution. Note that priorities are only available
+ * on kernel nodes, and are copied from stream priority during stream capture.
+ *
+ * If \p graph contains any allocation or free nodes, there can be at most one
+ * executable graph in existence for that graph at a time. An attempt to instantiate a
+ * second executable graph before destroying the first with ::cudaGraphExecDestroy will
+ * result in an error.
+ * The same also applies if \p graph contains any device-updatable kernel nodes.
+ *
+ * If \p graph contains kernels which call device-side cudaGraphLaunch() from multiple
+ * devices, this will result in an error.
+ *
+ * Graphs instantiated for launch on the device have additional restrictions which do not
+ * apply to host graphs:
+ *
+ * - The graph's nodes must reside on a single device.
+ * - The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes.
+ * - The graph cannot be empty and must contain at least one kernel, memcpy, or memset node.
+ *   Operation-specific restrictions are outlined below.
+ * - Kernel nodes:
+ *   - Use of CUDA Dynamic Parallelism is not permitted.
+ *   - Cooperative launches are permitted as long as MPS is not in use.
+ * - Memcpy nodes:
+ *   - Only copies involving device memory and/or pinned device-mapped host memory are permitted.
+ *   - Copies involving CUDA arrays are not permitted.
+ *   - Both operands must be accessible from the current device, and the current device must
+ *     match the device of other nodes in the graph.
+ *
+ * In the event of an error, the \p result_out and \p errNode_out fields will contain more
+ * information about the nature of the error. Possible error reporting includes:
+ *
+ * - ::cudaGraphInstantiateError, if passed an invalid value or if an unexpected error occurred
+ *   which is described by the return value of the function. \p errNode_out will be set to NULL.
+ * - ::cudaGraphInstantiateInvalidStructure, if the graph structure is invalid. \p errNode_out
+ *   will be set to one of the offending nodes.
+ * - ::cudaGraphInstantiateNodeOperationNotSupported, if the graph is instantiated for device
+ *   launch but contains a node of an unsupported node type, or a node which performs unsupported
+ *   operations, such as use of CUDA dynamic parallelism within a kernel node. \p errNode_out will
+ *   be set to this node.
+ * - ::cudaGraphInstantiateMultipleDevicesNotSupported, if the graph is instantiated for device
+ *   launch but a node's device differs from that of another node. This error can also be returned
+ *   if a graph is not instantiated for device launch and it contains kernels which call device-side
+ *   cudaGraphLaunch() from multiple devices. \p errNode_out will be set to this node.
+ *
+ * If instantiation is successful, \p result_out will be set to ::cudaGraphInstantiateSuccess,
+ * and \p errNode_out will be set to NULL.
+ *
+ * \param pGraphExec       - Returns instantiated graph
+ * \param graph            - Graph to instantiate
+ * \param instantiateParams - Instantiation parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphCreate,
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphInstantiateWithFlags,
+ * ::cudaGraphExecDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiateWithParams(cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphInstantiateParams *instantiateParams);
+
+/**
+ * \brief Query the instantiation flags of an executable graph
+ *
+ * Returns the flags that were passed to instantiation for the given executable graph.
+ * ::cudaGraphInstantiateFlagUpload will not be returned by this API as it does
+ * not affect the resulting executable graph.
+ *
+ * \param graphExec - The executable graph to query
+ * \param flags     - Returns the instantiation flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphInstantiateWithFlags,
+ * ::cudaGraphInstantiateWithParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecGetFlags(cudaGraphExec_t graphExec, unsigned long long *flags);
+
+/**
+ * \brief Sets the parameters for a kernel node in the given graphExec
+ *
+ * Sets the parameters of a kernel node in an executable graph \p hGraphExec. 
+ * The node is identified by the corresponding node \p node in the 
+ * non-executable graph, from which the executable graph was instantiated. 
+ *
+ * \p node must not have been removed from the original graph. All \p nodeParams 
+ * fields may change, but the following restrictions apply to \p func updates: 
+ *
+ *   - The owning device of the function cannot change.
+ *   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
+ *     to a function which uses CDP.
+ *   - A node whose function originally did not make device-side update calls cannot be updated
+ *     to a function which makes device-side update calls.
+ *   - If \p hGraphExec was not instantiated for device launch, a node whose function originally
+ *     did not use device-side cudaGraphLaunch() cannot be updated to a function which uses
+ *     device-side cudaGraphLaunch() unless the node resides on the same device as nodes which
+ *     contained such calls at instantiate-time. If no such calls were present at instantiation,
+ *     these updates cannot be performed at all.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already 
+ * enqueued or running launches of \p hGraphExec are not affected by this call. 
+ * \p node is also not modified by this call.
+ *
+ * If \p node is a device-updatable kernel node, the next upload/launch of \p hGraphExec
+ * will overwrite any previous device-side updates. Additionally, applying host updates to a
+ * device-updatable kernel node while it is being updated from the device will result in
+ * undefined behavior.
+ *
+ * \param hGraphExec  - The executable graph in which to set the specified node
+ * \param node        - kernel node from the graph from which graphExec was instantiated
+ * \param pNodeParams - Updated Parameters to set
+ * 
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecKernelNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams);
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec.
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained \p pNodeParams at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * The source and destination memory in \p pNodeParams must be allocated from the same 
+ * contexts as the original source and destination memory.  Both the instantiation-time 
+ * memory operands and the memory operands in \p pNodeParams must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * either the original or new memory operands are multidimensional.
+ *
+ * \param hGraphExec  - The executable graph in which to set the specified node
+ * \param node        - Memcpy node from the graph which was used to instantiate graphExec
+ * \param pNodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphExecMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphExecMemcpyNodeSetParams1D,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams);
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec to copy to a symbol on the device
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained the given params at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * \p src and \p symbol must be allocated from the same contexts as the original source and
+ * destination memory.  The instantiation-time memory operands must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * the original memory operands are multidimensional.
+ *
+ * \param hGraphExec      - The executable graph in which to set the specified node
+ * \param node            - Memcpy node from the graph which was used to instantiate graphExec
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeToSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParamsToSymbol(
+    cudaGraphExec_t hGraphExec,
+    cudaGraphNode_t node,
+    const void* symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec to copy from a symbol on the device
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained the given params at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * \p symbol and \p dst must be allocated from the same contexts as the original source and
+ * destination memory.  The instantiation-time memory operands must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * the original memory operands are multidimensional.
+ *
+ * \param hGraphExec      - The executable graph in which to set the specified node
+ * \param node            - Memcpy node from the graph which was used to instantiate graphExec
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParamsFromSymbol(
+    cudaGraphExec_t hGraphExec,
+    cudaGraphNode_t node,
+    void* dst,
+    const void* symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec to perform a 1-dimensional copy
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained the given params at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * \p src and \p dst must be allocated from the same contexts as the original source
+ * and destination memory.  The instantiation-time memory operands must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * the original memory operands are multidimensional.
+ *
+ * \param hGraphExec      - The executable graph in which to set the specified node
+ * \param node            - Memcpy node from the graph which was used to instantiate graphExec
+ * \param dst             - Destination memory address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNode1D,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParams1D,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams1D(
+    cudaGraphExec_t hGraphExec,
+    cudaGraphNode_t node,
+    void* dst,
+    const void* src,
+    size_t count,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Sets the parameters for a memset node in the given graphExec.
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained \p pNodeParams at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * The destination memory in \p pNodeParams must be allocated from the same 
+ * context as the original destination memory.  Both the instantiation-time 
+ * memory operand and the memory operand in \p pNodeParams must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operand's mappings changed or
+ * either the original or new memory operand is multidimensional.
+ *
+ * \param hGraphExec  - The executable graph in which to set the specified node
+ * \param node        - Memset node from the graph which was used to instantiate graphExec
+ * \param pNodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddMemsetNode,
+ * ::cudaGraphMemsetNodeSetParams,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemsetNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams);
+
+/**
+ * \brief Sets the parameters for a host node in the given graphExec.
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained \p pNodeParams at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * \param hGraphExec  - The executable graph in which to set the specified node
+ * \param node        - Host node from the graph which was used to instantiate graphExec
+ * \param pNodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphHostNodeSetParams,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecHostNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams);
+
+/**
+ * \brief Updates node parameters in the child graph node in the given graphExec.
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though the nodes contained
+ * in \p node's graph had the parameters contained in \p childGraph's nodes at instantiation.
+ * \p node must remain in the graph which was used to instantiate \p hGraphExec.
+ * Changed edges to and from \p node are ignored.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also
+ * not modified by this call.
+ *
+ * The topology of \p childGraph, as well as the node insertion order,  must match that
+ * of the graph contained in \p node.  See ::cudaGraphExecUpdate() for a list of restrictions
+ * on what can be updated in an instantiated graph.  The update is recursive, so child graph
+ * nodes contained within the top level child graph will also be updated.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param node       - Child graph node from the graph which was used to instantiate graphExec
+ * \param childGraph - The graph supplying the updated parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphChildGraphNodeGetGraph,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecChildGraphNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, cudaGraph_t childGraph);
+#endif
+
+/**
+ * \brief Sets the event for an event record node in the given graphExec
+ *
+ * Sets the event of an event record node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Event record node from the graph from which graphExec was instantiated
+ * \param event      - Updated event to use
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphEventRecordNodeGetEvent,
+ * ::cudaGraphEventWaitNodeSetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecEventRecordNodeSetEvent(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event);
+#endif
+
+/**
+ * \brief Sets the event for an event wait node in the given graphExec
+ *
+ * Sets the event of an event wait node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Event wait node from the graph from which graphExec was instantiated
+ * \param event      - Updated event to use
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphEventWaitNodeGetEvent,
+ * ::cudaGraphEventRecordNodeSetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecEventWaitNodeSetEvent(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event);
+#endif
+
+/**
+ * \brief Sets the parameters for an external semaphore signal node in the given graphExec
+ *
+ * Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * Changing \p nodeParams->numExtSems is not supported.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - semaphore signal node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaImportExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecExternalSemaphoresSignalNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, const struct cudaExternalSemaphoreSignalNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Sets the parameters for an external semaphore wait node in the given graphExec
+ *
+ * Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * Changing \p nodeParams->numExtSems is not supported.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - semaphore wait node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExecNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaImportExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecExternalSemaphoresWaitNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, const struct cudaExternalSemaphoreWaitNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Enables or disables the specified node in the given graphExec
+ *
+ * Sets \p hNode to be either enabled or disabled. Disabled nodes are functionally equivalent 
+ * to empty nodes until they are reenabled. Existing node parameters are not affected by 
+ * disabling/enabling the node.
+ *  
+ * The node is identified by the corresponding node \p hNode in the non-executable 
+ * graph, from which the executable graph was instantiated.   
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \note Currently only kernel, memset and memcpy nodes are supported. 
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Node from the graph from which graphExec was instantiated
+ * \param isEnabled  - Node is enabled if != 0, otherwise the node is disabled
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeGetEnabled,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphLaunch
+ */
+#if __CUDART_API_VERSION >= 11060
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeSetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, unsigned int isEnabled);
+#endif
+
+/**
+ * \brief Query whether a node in the given graphExec is enabled
+ *
+ * Sets isEnabled to 1 if \p hNode is enabled, or 0 if \p hNode is disabled.
+ *
+ * The node is identified by the corresponding node \p hNode in the non-executable 
+ * graph, from which the executable graph was instantiated.   
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * \note Currently only kernel, memset and memcpy nodes are supported. 
+ *
+ * \param hGraphExec - The executable graph in which to query the specified node
+ * \param hNode      - Node from the graph from which graphExec was instantiated
+ * \param isEnabled  - Location to return the enabled status of the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetEnabled,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphLaunch
+ */
+#if __CUDART_API_VERSION >= 11060
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, unsigned int *isEnabled);
+#endif
+
+/**
+ * \brief Check whether an executable graph can be updated with a graph and perform the update if possible
+ *
+ * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the
+ * node parameters in a topologically identical graph specified by \p hGraph.
+ *
+ * Limitations:
+ *
+ * - Kernel nodes:
+ *   - The owning context of the function cannot change.
+ *   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
+ *     to a function which uses CDP.
+ *   - A node whose function originally did not make device-side update calls cannot be updated
+ *     to a function which makes device-side update calls.
+ *   - A cooperative node cannot be updated to a non-cooperative node, and vice-versa.
+ *   - If the graph was instantiated with cudaGraphInstantiateFlagUseNodePriority, the
+ *     priority attribute cannot change. Equality is checked on the originally requested
+ *     priority values, before they are clamped to the device's supported range.
+ *   - If \p hGraphExec was not instantiated for device launch, a node whose function originally
+ *     did not use device-side cudaGraphLaunch() cannot be updated to a function which uses
+ *     device-side cudaGraphLaunch() unless the node resides on the same device as nodes which
+ *     contained such calls at instantiate-time. If no such calls were present at instantiation,
+ *     these updates cannot be performed at all.
+ *   - Neither \p hGraph nor \p hGraphExec may contain device-updatable kernel nodes.
+ * - Memset and memcpy nodes:
+ *   - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
+ *   - The source/destination memory must be allocated from the same contexts as the original
+ *     source/destination memory.
+ *   - Only 1D memsets can be changed.
+ * - Additional memcpy node restrictions:
+ *   - Changing either the source or destination memory type (e.g. CU_MEMORYTYPE_DEVICE,
+ *     CU_MEMORYTYPE_ARRAY, etc.) is not supported.
+ * - Conditional nodes:
+ *   - Changing node parameters is not supported.
+ *   - Changing parameters of nodes within the conditional body graph is subject to the rules above.
+ *   - Conditional handle flags and default values are updated as part of the graph update.
+ *
+ * Note:  The API may add further restrictions in future releases.  The return code should always be checked.
+ *
+ * cudaGraphExecUpdate sets the result member of \p resultInfo to cudaGraphExecUpdateErrorTopologyChanged
+ * under the following conditions:
+ * - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case resultInfo->errorNode
+ *   is set to NULL.
+ * - \p hGraph has more exit nodes than \p hGraphExec, in which case resultInfo->errorNode is set to one of
+ *   the exit nodes in \p hGraph.
+ * - A node in \p hGraph has a different number of dependencies than the node from \p hGraphExec it is paired with,
+ *   in which case resultInfo->errorNode is set to the node from \p hGraph.
+ * - A node in \p hGraph has a dependency that does not match with the corresponding dependency of the paired node
+ *   from \p hGraphExec. resultInfo->errorNode will be set to the node from \p hGraph. resultInfo->errorFromNode
+ *   will be set to the mismatched dependency. The dependencies are paired based on edge order and a dependency
+ *   does not match when the nodes are already paired based on other edges examined in the graph.
+ *
+ * cudaGraphExecUpdate sets the result member of \p resultInfo to:
+ * - cudaGraphExecUpdateError if passed an invalid value.
+ * - cudaGraphExecUpdateErrorTopologyChanged if the graph topology changed
+ * - cudaGraphExecUpdateErrorNodeTypeChanged if the type of a node changed, in which case
+ *   resultInfo->errorNode is set to the node from \p hGraph.
+ * - cudaGraphExecUpdateErrorFunctionChanged if the function of a kernel node changed (CUDA driver < 11.2)
+ * - cudaGraphExecUpdateErrorUnsupportedFunctionChange if the func field of a kernel changed in an
+ *   unsupported way (see note above), in which case resultInfo->errorNode is set to the node from \p hGraph
+ * - cudaGraphExecUpdateErrorParametersChanged if any parameters to a node changed in a way
+ *   that is not supported, in which case resultInfo->errorNode is set to the node from \p hGraph
+ * - cudaGraphExecUpdateErrorAttributesChanged if any attributes of a node changed in a way
+ *   that is not supported, in which case resultInfo->errorNode is set to the node from \p hGraph
+ * - cudaGraphExecUpdateErrorNotSupported if something about a node is unsupported, like
+ *   the node's type or configuration, in which case resultInfo->errorNode is set to the node from \p hGraph
+ *
+ * If the update fails for a reason not listed above, the result member of \p resultInfo will be set
+ * to cudaGraphExecUpdateError. If the update succeeds, the result member will be set to cudaGraphExecUpdateSuccess.
+ *
+ * cudaGraphExecUpdate returns cudaSuccess when the update was performed successfully.  It returns
+ * cudaErrorGraphExecUpdateFailure if the graph update was not performed because it included 
+ * changes which violated constraints specific to instantiated graph update.
+ *
+ * \param hGraphExec The instantiated graph to be updated
+ * \param hGraph The graph containing the updated parameters
+ * \param resultInfo the error info structure
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorGraphExecUpdateFailure,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphInstantiate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph, cudaGraphExecUpdateResultInfo *resultInfo);
+
+/**
+ * \brief Uploads an executable graph in a stream
+ *
+ * Uploads \p graphExec to the device in \p stream without executing it. Uploads of
+ * the same \p graphExec will be serialized. Each upload is ordered behind both any
+ * previous work in \p stream and any previous launches of \p graphExec.
+ * Uses memory cached by \p stream to back the allocations owned by \p graphExec.
+ *
+ * \param graphExec - Executable graph to upload
+ * \param stream    - Stream in which to upload the graph
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_init_rt
+ *
+ * \sa
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphLaunch,
+ * ::cudaGraphExecDestroy
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphUpload(cudaGraphExec_t graphExec, cudaStream_t stream);
+#endif
+
+/**
+ * \brief Launches an executable graph in a stream
+ *
+ * Executes \p graphExec in \p stream. Only one instance of \p graphExec may be executing
+ * at a time. Each launch is ordered behind both any previous work in \p stream
+ * and any previous launches of \p graphExec. To execute a graph concurrently, it must be
+ * instantiated multiple times into multiple executable graphs.
+ *
+ * If any allocations created by \p graphExec remain unfreed (from a previous launch) and
+ * \p graphExec was not instantiated with ::cudaGraphInstantiateFlagAutoFreeOnLaunch,
+ * the launch will fail with ::cudaErrorInvalidValue.
+ *
+ * \param graphExec - Executable graph to launch
+ * \param stream    - Stream in which to launch the graph
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphUpload,
+ * ::cudaGraphExecDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream);
+
+/**
+ * \brief Destroys an executable graph
+ *
+ * Destroys the executable graph specified by \p graphExec.
+ *
+ * \param graphExec - Executable graph to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphUpload,
+ * ::cudaGraphLaunch
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecDestroy(cudaGraphExec_t graphExec);
+
+/**
+ * \brief Destroys a graph
+ *
+ * Destroys the graph specified by \p graph, as well as all of its nodes.
+ *
+ * \param graph - Graph to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphDestroy(cudaGraph_t graph);
+
+/**
+ * \brief Write a DOT file describing graph structure
+ *
+ * Using the provided \p graph, write to \p path a DOT formatted description of the graph.
+ * By default this includes the graph topology, node types, node id, kernel names and memcpy direction.
+ * \p flags can be specified to write more detailed information about each node type such as
+ * parameter values, kernel attributes, node and function handles.
+ *
+ * \param graph - The graph to create a DOT file from
+ * \param path  - The path to write the DOT file to
+ * \param flags - Flags from cudaGraphDebugDotFlags for specifying which additional node information to write
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOperatingSystem
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphDebugDotPrint(cudaGraph_t graph, const char *path, unsigned int flags);
+
+/**
+ * \brief Create a user object
+ *
+ * Create a user object with the specified destructor callback and initial reference count. The
+ * initial references are owned by the caller.
+ *
+ * Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they
+ * are executed by a shared internal thread. Another thread may be signaled to perform such
+ * actions, if it does not block forward progress of tasks scheduled through CUDA.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object_out      - Location to return the user object handle
+ * \param ptr             - The pointer to pass to the destroy function
+ * \param destroy         - Callback to free the user object when it is no longer in use
+ * \param initialRefcount - The initial refcount to create the object with, typically 1. The
+ *                          initial references are owned by the calling thread.
+ * \param flags           - Currently it is required to pass ::cudaUserObjectNoDestructorSync,
+ *                          which is the only defined flag. This indicates that the destroy
+ *                          callback cannot be waited on by any CUDA API. Users requiring
+ *                          synchronization of the callback should signal its completion
+ *                          manually.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectRetain,
+ * ::cudaUserObjectRelease,
+ * ::cudaGraphRetainUserObject,
+ * ::cudaGraphReleaseUserObject,
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaUserObjectCreate(cudaUserObject_t *object_out, void *ptr, cudaHostFn_t destroy, unsigned int initialRefcount, unsigned int flags);
+
+/**
+ * \brief Retain a reference to a user object
+ *
+ * Retains new references to a user object. The new references are owned by the caller.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object - The object to retain
+ * \param count  - The number of references to retain, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectCreate,
+ * ::cudaUserObjectRelease,
+ * ::cudaGraphRetainUserObject,
+ * ::cudaGraphReleaseUserObject,
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaUserObjectRetain(cudaUserObject_t object, unsigned int count __dv(1));
+
+/**
+ * \brief Release a reference to a user object
+ *
+ * Releases user object references owned by the caller. The object's destructor is invoked if
+ * the reference count reaches zero.
+ *
+ * It is undefined behavior to release references not owned by the caller, or to use a user
+ * object handle after all references are released.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object - The object to release
+ * \param count  - The number of references to release, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectCreate,
+ * ::cudaUserObjectRetain,
+ * ::cudaGraphRetainUserObject,
+ * ::cudaGraphReleaseUserObject,
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaUserObjectRelease(cudaUserObject_t object, unsigned int count __dv(1));
+
+/**
+ * \brief Retain a reference to a user object from a graph
+ *
+ * Creates or moves user object references that will be owned by a CUDA graph.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param graph  - The graph to associate the reference with
+ * \param object - The user object to retain a reference for
+ * \param count  - The number of references to add to the graph, typically 1. Must be
+ *                 nonzero and not larger than INT_MAX.
+ * \param flags  - The optional flag ::cudaGraphUserObjectMove transfers references
+ *                 from the calling thread, rather than create new references. Pass 0
+ *                 to create new references.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectCreate,
+ * ::cudaUserObjectRetain,
+ * ::cudaUserObjectRelease,
+ * ::cudaGraphReleaseUserObject,
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphRetainUserObject(cudaGraph_t graph, cudaUserObject_t object, unsigned int count __dv(1), unsigned int flags __dv(0));
+
+/**
+ * \brief Release a user object reference from a graph
+ *
+ * Releases user object references owned by a graph.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param graph  - The graph that will release the reference
+ * \param object - The user object to release a reference for
+ * \param count  - The number of references to release, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectCreate,
+ * ::cudaUserObjectRetain,
+ * ::cudaUserObjectRelease,
+ * ::cudaGraphRetainUserObject,
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphReleaseUserObject(cudaGraph_t graph, cudaUserObject_t object, unsigned int count __dv(1));
+
+/**
+ * \brief Adds a node of arbitrary type to a graph
+ *
+ * Creates a new node in \p graph described by \p nodeParams with \p numDependencies
+ * dependencies specified via \p pDependencies. \p numDependencies may be 0.
+ * \p pDependencies may be null if \p numDependencies is 0. \p pDependencies may not have
+ * any duplicate entries.
+ *
+ * \p nodeParams is a tagged union. The node type should be specified in the \p type field,
+ * and type-specific parameters in the corresponding union member. All unused bytes - that
+ * is, \p reserved0 and all bytes past the utilized union member - must be set to zero.
+ * It is recommended to use brace initialization or memset to ensure all bytes are
+ * initialized.
+ *
+ * Note that for some node types, \p nodeParams may contain "out parameters" which are
+ * modified during the call, such as \p nodeParams->alloc.dptr.
+ *
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Specification of the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorNotSupported
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphCreate,
+ * ::cudaGraphNodeSetParams,
+ * ::cudaGraphExecNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, struct cudaGraphNodeParams *nodeParams);
+
+/**
+ * \brief Adds a node of arbitrary type to a graph (12.3+)
+ *
+ * Creates a new node in \p graph described by \p nodeParams with \p numDependencies
+ * dependencies specified via \p pDependencies. \p numDependencies may be 0.
+ * \p pDependencies may be null if \p numDependencies is 0. \p pDependencies may not have
+ * any duplicate entries.
+ *
+ * \p nodeParams is a tagged union. The node type should be specified in the \p type field,
+ * and type-specific parameters in the corresponding union member. All unused bytes - that
+ * is, \p reserved0 and all bytes past the utilized union member - must be set to zero.
+ * It is recommended to use brace initialization or memset to ensure all bytes are
+ * initialized.
+ *
+ * Note that for some node types, \p nodeParams may contain "out parameters" which are
+ * modified during the call, such as \p nodeParams->alloc.dptr.
+ *
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param dependencyData  - Optional edge data for the dependencies. If NULL, the data is
+ *                          assumed to be default (zeroed) for all dependencies.
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Specification of the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorNotSupported
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphCreate,
+ * ::cudaGraphNodeSetParams,
+ * ::cudaGraphExecNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddNode_v2(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, const cudaGraphEdgeData *dependencyData, size_t numDependencies, struct cudaGraphNodeParams *nodeParams);
+
+/**
+ * \brief Updates a graph node's parameters
+ *
+ * Sets the parameters of graph node \p node to \p nodeParams. The node type specified by
+ * \p nodeParams->type must match the type of \p node. \p nodeParams must be fully
+ * initialized and all unused bytes (reserved, padding) zeroed.
+ *
+ * Modifying parameters is not supported for node types cudaGraphNodeTypeMemAlloc and
+ * cudaGraphNodeTypeMemFree.
+ *
+ * \param node       - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorNotSupported
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphExecNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeSetParams(cudaGraphNode_t node, struct cudaGraphNodeParams *nodeParams);
+
+/**
+ * \brief Updates a graph node's parameters in an instantiated graph
+ *
+ * Sets the parameters of a node in an executable graph \p graphExec. The node is identified
+ * by the corresponding node \p node in the non-executable graph from which the executable
+ * graph was instantiated. \p node must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p graphExec. Already
+ * enqueued or running launches of \p graphExec are not affected by this call.
+ * \p node is also not modified by this call.
+ *
+ * Allowed changes to parameters on executable graphs are as follows:
+ * <table>
+ *   <tr><th>Node type<th>Allowed changes
+ *   <tr><td>kernel<td>See ::cudaGraphExecKernelNodeSetParams
+ *   <tr><td>memcpy<td>Addresses for 1-dimensional copies if allocated in same context; see ::cudaGraphExecMemcpyNodeSetParams
+ *   <tr><td>memset<td>Addresses for 1-dimensional memsets if allocated in same context; see ::cudaGraphExecMemsetNodeSetParams
+ *   <tr><td>host<td>Unrestricted
+ *   <tr><td>child graph<td>Topology must match and restrictions apply recursively; see ::cudaGraphExecUpdate
+ *   <tr><td>event wait<td>Unrestricted
+ *   <tr><td>event record<td>Unrestricted
+ *   <tr><td>external semaphore signal<td>Number of semaphore operations cannot change
+ *   <tr><td>external semaphore wait<td>Number of semaphore operations cannot change
+ *   <tr><td>memory allocation<td>API unsupported
+ *   <tr><td>memory free<td>API unsupported
+ * </table>
+ *
+ * \param graphExec  - The executable graph in which to update the specified node
+ * \param node       - Corresponding node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorNotSupported
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cudaGraphNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecNodeSetParams(cudaGraphExec_t graphExec, cudaGraphNode_t node, struct cudaGraphNodeParams *nodeParams);
+
+/**
+ * \brief Create a conditional handle
+ *
+ * Creates a conditional handle associated with \p graph.
+ *
+ * The conditional handle must be associated with a conditional node in this graph or one of its children.
+ *
+ * Handles not associated with a conditional node may cause graph instantiation to fail.
+ *
+ * \param pHandle_out        - Pointer used to return the handle to the caller.
+ * \param graph              - Graph which will contain the conditional node using this handle.
+ * \param defaultLaunchValue - Optional initial value for the conditional variable.
+ * \param flags              - Currently must be cudaGraphCondAssignDefault or 0.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphAddNode,
+ * ::cuGraphAddNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphConditionalHandleCreate(cudaGraphConditionalHandle *pHandle_out, cudaGraph_t graph, unsigned int defaultLaunchValue __dv(0), unsigned int flags __dv(0));
+
+/** @} */ /* END CUDART_GRAPH */
+
+/**
+ * \defgroup CUDART_DRIVER_ENTRY_POINT Driver Entry Point Access
+ *
+ * ___MANBRIEF___ driver entry point access functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the driver entry point access functions of CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the requested driver API function pointer
+ *
+ * Returns in \p **funcPtr the address of the CUDA driver function for the requested flags.
+ *
+ * For a requested driver symbol, if the CUDA version in which the driver symbol was
+ * introduced is less than or equal to the CUDA runtime version, the API will return
+ * the function pointer to the corresponding versioned driver function.
+ *
+ * The pointer returned by the API should be cast to a function pointer matching the
+ * requested driver function's definition in the API header file. The function pointer
+ * typedef can be picked up from the corresponding typedefs header file. For example,
+ * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h.
+ *
+ * The API will return ::cudaSuccess and set the returned \p funcPtr to NULL if the
+ * requested driver function is not supported on the platform, no ABI
+ * compatible driver function exists for the CUDA runtime version or if the
+ * driver symbol is invalid.
+ *
+ * It will also set the optional \p driverStatus to one of the values in 
+ * ::cudaDriverEntryPointQueryResult with the following meanings:
+ * - ::cudaDriverEntryPointSuccess - The requested symbol was successfully found based
+ *   on input arguments and \p funcPtr is valid
+ * - ::cudaDriverEntryPointSymbolNotFound - The requested symbol was not found
+ * - ::cudaDriverEntryPointVersionNotSufficent - The requested symbol was found but is
+ *   not supported by the current runtime version (CUDART_VERSION)
+ *
+ * The requested flags can be:
+ * - ::cudaEnableDefault: This is the default mode. This is equivalent to
+ *   ::cudaEnablePerThreadDefaultStream if the code is compiled with
+ *   --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM
+ *   is defined; ::cudaEnableLegacyStream otherwise.
+ * - ::cudaEnableLegacyStream: This will enable the search for all driver symbols
+ *   that match the requested driver symbol name except the corresponding per-thread versions.
+ * - ::cudaEnablePerThreadDefaultStream: This will enable the search for all
+ *   driver symbols that match the requested driver symbol name including the per-thread
+ *   versions. If a per-thread version is not found, the API will return the legacy version
+ *   of the driver function.
+ *
+ * \param symbol - The base name of the driver API function to look for. As an example,
+ *                 for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc.
+ *                 Note that the API will use the CUDA runtime version to return the
+ *                 address to the most recent ABI compatible driver symbol, ::cuMemAlloc
+ *                 or ::cuMemAlloc_v2.
+ * \param funcPtr - Location to return the function pointer to the requested driver function
+ * \param flags -  Flags to specify search options.
+ * \param driverStatus - Optional location to store the status of finding the symbol from
+ *                       the driver. See ::cudaDriverEntryPointQueryResult for 
+ *                       possible values.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \note_version_mixing
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuGetProcAddress
+ */
+#if defined(__cplusplus)
+extern __host__ cudaError_t CUDARTAPI cudaGetDriverEntryPoint(const char *symbol, void **funcPtr, unsigned long long flags, enum cudaDriverEntryPointQueryResult *driverStatus = NULL);
+#else
+extern __host__ cudaError_t CUDARTAPI cudaGetDriverEntryPoint(const char *symbol, void **funcPtr, unsigned long long flags, enum cudaDriverEntryPointQueryResult *driverStatus);
+#endif
+
+/** @} */ /* END CUDART_DRIVER_ENTRY_POINT */
+
+/** \cond impl_private */
+extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(const void **ppExportTable, const cudaUUID_t *pExportTableId);
+/** \endcond impl_private */
+
+/**
+ * \defgroup CUDART_HIGHLEVEL C++ API Routines
+ *
+ * ___MANBRIEF___ C++ high level API functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the C++ high level API functions of the CUDA runtime
+ * application programming interface. To use these functions, your
+ * application needs to be compiled with the \p nvcc compiler.
+ *
+ * \brief C++-style interface built on top of CUDA runtime API
+ */
+
+/**
+ * \defgroup CUDART_DRIVER Interactions with the CUDA Driver API
+ *
+ * ___MANBRIEF___ interactions between CUDA Driver API and CUDA Runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the interactions between the CUDA Driver API and the CUDA Runtime API
+ *
+ * @{
+ *
+ * \section CUDART_CUDA_primary Primary Contexts
+ *
+ * There exists a one to one relationship between CUDA devices in the CUDA Runtime
+ * API and ::CUcontext s in the CUDA Driver API within a process.  The specific
+ * context which the CUDA Runtime API uses for a device is called the device's
+ * primary context.  From the perspective of the CUDA Runtime API, a device and 
+ * its primary context are synonymous.
+ *
+ * \section CUDART_CUDA_init Initialization and Tear-Down
+ *
+ * CUDA Runtime API calls operate on the CUDA Driver API ::CUcontext which is current
+ * to the calling host thread.
+ * 
+ * The function ::cudaInitDevice() ensures that the primary context is initialized
+ * for the requested device but does not make it current to the calling thread. 
+ *
+ * The function ::cudaSetDevice() initializes the primary context for the
+ * specified device and makes it current to the calling thread by calling ::cuCtxSetCurrent().
+ *
+ * The CUDA Runtime API will automatically initialize the primary context for
+ * a device at the first CUDA Runtime API call which requires an active context.
+ * If no ::CUcontext is current to the calling thread when a CUDA Runtime API call 
+ * which requires an active context is made, then the primary context for a device 
+ * will be selected, made current to the calling thread, and initialized.
+ *
+ * The context which the CUDA Runtime API initializes will be initialized using 
+ * the parameters specified by the CUDA Runtime API functions
+ * ::cudaSetDeviceFlags(), 
+ * ::cudaD3D9SetDirect3DDevice(), 
+ * ::cudaD3D10SetDirect3DDevice(), 
+ * ::cudaD3D11SetDirect3DDevice(), 
+ * ::cudaGLSetGLDevice(), and
+ * ::cudaVDPAUSetVDPAUDevice().
+ * Note that these functions will fail with ::cudaErrorSetOnActiveProcess if they are
+ * called when the primary context for the specified device has already been initialized
+ * (or if the current device has already been initialized, in the case of
+ * ::cudaSetDeviceFlags()).
+ *
+ * Primary contexts will remain active until they are explicitly deinitialized 
+ * using ::cudaDeviceReset().  The function ::cudaDeviceReset() will deinitialize the 
+ * primary context for the calling thread's current device immediately.  The context 
+ * will remain current to all of the threads that it was current to.  The next CUDA 
+ * Runtime API call on any thread which requires an active context will trigger the 
+ * reinitialization of that device's primary context.
+ *
+ * Note that primary contexts are shared resources. It is recommended that
+ * the primary context not be reset except just before exit or to recover from an
+ * unspecified launch failure.
+ * 
+ * \section CUDART_CUDA_context Context Interoperability
+ *
+ * Note that the use of multiple ::CUcontext s per device within a single process 
+ * will substantially degrade performance and is strongly discouraged.  Instead,
+ * it is highly recommended that the implicit one-to-one device-to-context mapping
+ * for the process provided by the CUDA Runtime API be used.
+ *
+ * If a non-primary ::CUcontext created by the CUDA Driver API is current to a
+ * thread then the CUDA Runtime API calls to that thread will operate on that 
+ * ::CUcontext, with some exceptions listed below.  Interoperability between data
+ * types is discussed in the following sections.
+ *
+ * The function ::cudaPointerGetAttributes() will return the error 
+ * ::cudaErrorIncompatibleDriverContext if the pointer being queried was allocated by a 
+ * non-primary context.  The function ::cudaDeviceEnablePeerAccess() and the rest of 
+ * the peer access API may not be called when a non-primary ::CUcontext is current.  
+ * To use the pointer query and peer access APIs with a context created using the 
+ * CUDA Driver API, it is necessary that the CUDA Driver API be used to access
+ * these features.
+ *
+ * All CUDA Runtime API state (e.g., global variables' addresses and values) travels
+ * with its underlying ::CUcontext.  In particular, if a ::CUcontext is moved from one 
+ * thread to another then all CUDA Runtime API state will move to that thread as well.
+ *
+ * Please note that attaching to legacy contexts (those with a version of 3010 as returned
+ * by ::cuCtxGetApiVersion()) is not possible. The CUDA Runtime will return
+ * ::cudaErrorIncompatibleDriverContext in such cases.
+ *
+ * \section CUDART_CUDA_stream Interactions between CUstream and cudaStream_t
+ *
+ * The types ::CUstream and ::cudaStream_t are identical and may be used interchangeably.
+ *
+ * \section CUDART_CUDA_event Interactions between CUevent and cudaEvent_t
+ *
+ * The types ::CUevent and ::cudaEvent_t are identical and may be used interchangeably.
+ *
+ * \section CUDART_CUDA_array Interactions between CUarray and cudaArray_t 
+ *
+ * The types ::CUarray and struct ::cudaArray * represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::CUarray in a CUDA Runtime API function which takes a struct ::cudaArray *,
+ * it is necessary to explicitly cast the ::CUarray to a struct ::cudaArray *.
+ *
+ * In order to use a struct ::cudaArray * in a CUDA Driver API function which takes a ::CUarray,
+ * it is necessary to explicitly cast the struct ::cudaArray * to a ::CUarray .
+ *
+ * \section CUDART_CUDA_graphicsResource Interactions between CUgraphicsResource and cudaGraphicsResource_t
+ *
+ * The types ::CUgraphicsResource and ::cudaGraphicsResource_t represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::CUgraphicsResource in a CUDA Runtime API function which takes a 
+ * ::cudaGraphicsResource_t, it is necessary to explicitly cast the ::CUgraphicsResource 
+ * to a ::cudaGraphicsResource_t.
+ *
+ * In order to use a ::cudaGraphicsResource_t in a CUDA Driver API function which takes a
+ * ::CUgraphicsResource, it is necessary to explicitly cast the ::cudaGraphicsResource_t 
+ * to a ::CUgraphicsResource.
+ *
+ * \section CUDART_CUDA_texture_objects Interactions between CUtexObject and cudaTextureObject_t
+ *
+ * The types ::CUtexObject and ::cudaTextureObject_t represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::CUtexObject in a CUDA Runtime API function which takes a ::cudaTextureObject_t,
+ * it is necessary to explicitly cast the ::CUtexObject to a ::cudaTextureObject_t.
+ *
+ * In order to use a ::cudaTextureObject_t in a CUDA Driver API function which takes a ::CUtexObject,
+ * it is necessary to explicitly cast the ::cudaTextureObject_t to a ::CUtexObject.
+ *
+ * \section CUDART_CUDA_surface_objects Interactions between CUsurfObject and cudaSurfaceObject_t
+ *
+ * The types ::CUsurfObject and ::cudaSurfaceObject_t represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::CUsurfObject in a CUDA Runtime API function which takes a ::cudaSurfaceObject_t,
+ * it is necessary to explicitly cast the ::CUsurfObject to a ::cudaSurfaceObject_t.
+ *
+ * In order to use a ::cudaSurfaceObject_t in a CUDA Driver API function which takes a ::CUsurfObject,
+ * it is necessary to explicitly cast the ::cudaSurfaceObject_t to a ::CUsurfObject.
+ *
+ * \section CUDART_CUDA_module Interactions between CUfunction and cudaFunction_t
+ *
+ * The types ::CUfunction and ::cudaFunction_t represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::cudaFunction_t in a CUDA Driver API function which takes a ::CUfunction,
+ * it is necessary to explicitly cast the ::cudaFunction_t to a ::CUfunction.
+ *
+ */
+
+ /**
+  * \brief Get pointer to device entry function that matches entry function \p symbolPtr
+  *
+  * Returns in \p functionPtr the device entry function corresponding to the symbol \p symbolPtr.
+  *
+  * \param functionPtr     - Returns the device entry function
+  * \param symbolPtr       - Pointer to device entry function to search for
+  *
+  * \return
+  * ::cudaSuccess
+  *
+  */
+extern __host__ cudaError_t cudaGetFuncBySymbol(cudaFunction_t* functionPtr, const void* symbolPtr);
+
+/**
+ * \brief Get pointer to device kernel that matches entry function \p entryFuncAddr
+  *
+  * Returns in \p kernelPtr the device kernel corresponding to the entry function \p entryFuncAddr.
+  *
+  * \param kernelPtr          - Returns the device kernel
+  * \param entryFuncAddr      - Address of device entry function to search kernel for
+  *
+  * \return
+  * ::cudaSuccess
+  *
+  * \sa
+  * \ref ::cudaGetKernel(cudaKernel_t *kernelPtr, const T *entryFuncAddr) "cudaGetKernel (C++ API)"
+  */
+extern __host__ cudaError_t CUDARTAPI cudaGetKernel(cudaKernel_t *kernelPtr, const void *entryFuncAddr);
+
+/** @} */ /* END CUDART_DRIVER */
+
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #undef cudaMemcpy
+    #undef cudaMemcpyToSymbol
+    #undef cudaMemcpyFromSymbol
+    #undef cudaMemcpy2D
+    #undef cudaMemcpyToArray
+    #undef cudaMemcpy2DToArray
+    #undef cudaMemcpyFromArray
+    #undef cudaMemcpy2DFromArray
+    #undef cudaMemcpyArrayToArray
+    #undef cudaMemcpy2DArrayToArray
+    #undef cudaMemcpy3D
+    #undef cudaMemcpy3DPeer
+    #undef cudaMemset
+    #undef cudaMemset2D
+    #undef cudaMemset3D
+    #undef cudaMemcpyAsync
+    #undef cudaMemcpyToSymbolAsync
+    #undef cudaMemcpyFromSymbolAsync
+    #undef cudaMemcpy2DAsync
+    #undef cudaMemcpyToArrayAsync
+    #undef cudaMemcpy2DToArrayAsync
+    #undef cudaMemcpyFromArrayAsync
+    #undef cudaMemcpy2DFromArrayAsync
+    #undef cudaMemcpy3DAsync
+    #undef cudaMemcpy3DPeerAsync
+    #undef cudaMemsetAsync
+    #undef cudaMemset2DAsync
+    #undef cudaMemset3DAsync
+    #undef cudaStreamQuery
+    #undef cudaStreamGetFlags
+    #undef cudaStreamGetId
+    #undef cudaStreamGetPriority
+    #undef cudaEventRecord
+    #undef cudaEventRecordWithFlags
+    #undef cudaStreamWaitEvent
+    #undef cudaStreamAddCallback
+    #undef cudaStreamAttachMemAsync
+    #undef cudaStreamSynchronize
+    #undef cudaLaunchKernel
+    #undef cudaLaunchKernelExC
+    #undef cudaLaunchHostFunc
+    #undef cudaMemPrefetchAsync
+    #undef cudaMemPrefetchAsync_v2
+    #undef cudaLaunchCooperativeKernel
+    #undef cudaSignalExternalSemaphoresAsync
+    #undef cudaWaitExternalSemaphoresAsync
+    #undef cudaGraphInstantiateWithParams
+    #undef cudaGraphUpload
+    #undef cudaGraphLaunch
+    #undef cudaStreamBeginCapture
+    #undef cudaStreamBeginCaptureToGraph
+    #undef cudaStreamEndCapture
+    #undef cudaStreamIsCapturing
+    #undef cudaStreamGetCaptureInfo
+    #undef cudaStreamGetCaptureInfo_v2
+    #undef cudaStreamGetCaptureInfo_v3
+    #undef cudaStreamUpdateCaptureDependencies
+    #undef cudaStreamUpdateCaptureDependencies_v2
+    #undef cudaStreamCopyAttributes
+    #undef cudaStreamGetAttribute
+    #undef cudaStreamSetAttribute
+    #undef cudaMallocAsync
+    #undef cudaFreeAsync
+    #undef cudaMallocFromPoolAsync
+    #undef cudaGetDriverEntryPoint
+
+    #undef cudaGetDeviceProperties
+
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p);
+    extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count);
+    extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height);
+    extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetId(cudaStream_t hStream, unsigned long long *streamId);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream __dv(0), unsigned int flags __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback, void *userData, unsigned int flags);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length, unsigned int flags);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaLaunchKernelExC(const cudaLaunchConfig_t *config, const void *func, void **args);
+    extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream, cudaHostFn_t fn, void *userData);
+    extern __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync_v2(const void *devPtr, size_t count, struct cudaMemLocation location, unsigned int flags, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync_ptsz(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync_ptsz(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiateWithParams(cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphInstantiateParams *instantiateParams);
+    extern __host__ cudaError_t CUDARTAPI cudaGraphUpload(cudaGraphExec_t graphExec, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamBeginCaptureToGraph(cudaStream_t stream, cudaGraph_t graph, const cudaGraphNode_t *dependencies, const cudaGraphEdgeData *dependencyData, size_t numDependencies, enum cudaStreamCaptureMode mode);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo(cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo_ptsz(cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo_v2(cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out __dv(0), cudaGraph_t *graph_out __dv(0), const cudaGraphNode_t **dependencies_out __dv(0), size_t *numDependencies_out __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo_v3(cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out __dv(0), cudaGraph_t *graph_out __dv(0), const cudaGraphNode_t **dependencies_out __dv(0), const cudaGraphEdgeData **edgeData_out __dv(0), size_t *numDependencies_out __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t *dependencies, size_t numDependencies, unsigned int flags __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaStreamUpdateCaptureDependencies_v2(cudaStream_t stream, cudaGraphNode_t *dependencies, const cudaGraphEdgeData *dependencyData, size_t numDependencies, unsigned int flags __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaStreamCopyAttributes(cudaStream_t dstStream, cudaStream_t srcStream);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamGetAttribute(cudaStream_t stream, cudaStreamAttrID attr, cudaStreamAttrValue *value);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamSetAttribute(cudaStream_t stream, cudaStreamAttrID attr, const cudaStreamAttrValue *param);
+
+    extern __host__ cudaError_t CUDARTAPI cudaMallocAsync(void **devPtr, size_t size, cudaStream_t hStream);
+    extern __host__ cudaError_t CUDARTAPI cudaFreeAsync(void *devPtr, cudaStream_t hStream);
+    extern __host__ cudaError_t CUDARTAPI cudaMallocFromPoolAsync(void **ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaGetDriverEntryPoint(const char *symbol, void **funcPtr, unsigned long long flags, enum cudaDriverEntryPointQueryResult *driverStatus);
+
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device);
+
+#elif defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM)
+    // nvcc stubs reference the plain 'cudaLaunch'/'cudaLaunchKernel' identifier even when it has been #defined
+    // to 'cudaLaunch_ptsz'/'cudaLaunchKernel_ptsz'. Redirect through a static inline function of that name.
+    #undef cudaLaunchKernel // drop any existing macro alias so the plain name can be defined as a function
+    static __inline__ __host__ cudaError_t cudaLaunchKernel(const void *func, 
+                                                            dim3 gridDim, dim3 blockDim, 
+                                                            void **args, size_t sharedMem, 
+                                                            cudaStream_t stream)
+    {
+        return cudaLaunchKernel_ptsz(func, gridDim, blockDim, args, sharedMem, stream); // pure forwarder: arguments passed through unchanged
+    }
+    #define cudaLaunchKernel __CUDART_API_PTSZ(cudaLaunchKernel)
+    #undef cudaLaunchKernelExC // drop any existing macro alias so the plain name can be defined as a function
+    static __inline__ __host__ cudaError_t cudaLaunchKernelExC(const cudaLaunchConfig_t *config,
+                                                               const void *func,
+                                                                  void **args)
+    {
+        return cudaLaunchKernelExC_ptsz(config, func, args); // pure forwarder: arguments passed through unchanged
+    }
+    #define cudaLaunchKernelExC __CUDART_API_PTSZ(cudaLaunchKernelExC)
+#endif
+
+#if defined(__cplusplus)
+}
+
+#endif /* __cplusplus */
+
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDACC_RTC__ */
+
+#undef __dv
+#undef __CUDA_DEPRECATED
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_API_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_API_H__
+#endif
+
+#endif /* !__CUDA_RUNTIME_API_H__ */
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_stdint.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_stdint.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a9814410e4b6fb4f07ad9edc8394e956b77dbcd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_stdint.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright 2009-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __cuda_stdint_h__
+#define __cuda_stdint_h__
+
+// Compiler-specific treatment for C99's stdint.h
+//
+// By default, this header will use the standard headers (so it
+// is your responsibility to make sure they are available), except
+// on MSVC before Visual Studio 2010, when they were not provided.
+// To support old MSVC, a few of the commonly-used definitions are
+// provided here.  If more definitions are needed, add them here,
+// or replace these definitions with a complete implementation,
+// such as the ones available from Google, Boost, or MSVC10.  You
+// can prevent the definition of any of these types (in order to
+// use your own) by #defining CU_STDINT_TYPES_ALREADY_DEFINED.
+
+#if !defined(CU_STDINT_TYPES_ALREADY_DEFINED)
+
+// In VS including stdint.h forces the C++ runtime dep - provide an opt-out
+// (CU_STDINT_VS_FORCE_NO_STDINT_H) for users that care (notably static
+// cudart).
+#if defined(_MSC_VER) && ((_MSC_VER < 1600) || defined(CU_STDINT_VS_FORCE_NO_STDINT_H))
+
+// These definitions can be used with MSVC 8 and 9,
+// which don't ship with stdint.h:
+
+typedef unsigned   char   uint8_t;
+
+typedef            short  int16_t;
+typedef unsigned   short uint16_t;
+
+// To stay consistent across all MSVC builds, define these types
+// in exactly the same way as they are defined by the MSVC headers.
+#if defined(_MSC_VER)
+typedef signed     char    int8_t;
+
+typedef            int     int32_t;
+typedef unsigned   int     uint32_t;
+
+typedef long long          int64_t;
+typedef unsigned long long uint64_t;
+#else // NOTE(review): unreachable - the enclosing #if already requires _MSC_VER to be defined
+typedef            char    int8_t;
+
+typedef            long   int32_t;
+typedef unsigned   long  uint32_t;
+
+typedef          __int64  int64_t;
+typedef unsigned __int64 uint64_t;
+#endif
+
+#elif defined(__DJGPP__)
+
+// These definitions can be used when compiling
+// C code with DJGPP, which only provides stdint.h
+// when compiling C++ code with TR1 enabled.
+
+typedef               char    int8_t; // NOTE(review): plain char signedness is implementation-defined - confirm it is signed under DJGPP
+typedef unsigned      char   uint8_t;
+
+typedef               short  int16_t;
+typedef unsigned      short uint16_t;
+
+typedef               long   int32_t;
+typedef unsigned      long  uint32_t;
+
+typedef          long long   int64_t;
+typedef unsigned long long  uint64_t;
+
+#else
+
+// Use standard headers, as specified by C99 and C++ TR1.
+// Known to be provided by:
+// - gcc/glibc, supported by all versions of glibc
+// - djgpp, supported since 2001
+// - MSVC, supported by Visual Studio 2010 and later
+
+#include <stdint.h>
+
+#endif
+
+#endif // !defined(CU_STDINT_TYPES_ALREADY_DEFINED)
+
+
+#endif // file guard
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_surface_types.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_surface_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a35c215668e98006c3eaa286deb70461eb1fa62
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_surface_types.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_SURFACE_TYPES_H__)
+#define __CUDA_SURFACE_TYPES_H__
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+#include "channel_descriptor.h"
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDACC_RTC__ */
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#endif /* !__CUDA_SURFACE_TYPES_H__ */
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_vdpau_interop.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_vdpau_interop.h
new file mode 100644
index 0000000000000000000000000000000000000000..2cf1ba357eb02ed82afc2f1812627a8a2d88c6f7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_vdpau_interop.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_VDPAU_INTEROP_H__)
+#define __CUDA_VDPAU_INTEROP_H__
+
+#include "cuda_runtime_api.h"
+
+#include <vdpau/vdpau.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \addtogroup CUDART_VDPAU VDPAU Interoperability
+ * This section describes the VDPAU interoperability functions of the CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Gets the CUDA device associated with a VdpDevice.
+ *
+ * Returns the CUDA device associated with a VdpDevice, if applicable.
+ *
+ * \param device - Returns the device associated with vdpDevice, or -1 if
+ * the device associated with vdpDevice is not a compute device.
+ * \param vdpDevice - A VdpDevice handle
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaVDPAUSetVDPAUDevice,
+ * ::cuVDPAUGetDevice
+ */
+extern __host__ cudaError_t CUDARTAPI cudaVDPAUGetDevice(int *device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+
+/**
+ * \brief Sets a CUDA device to use VDPAU interoperability
+ *
+ * Records \p vdpDevice as the VdpDevice for VDPAU interoperability 
+ * with the CUDA device \p device and sets \p device as the current 
+ * device for the calling host thread.
+ *
+ * This function will immediately initialize the primary context on 
+ * \p device if needed.
+ *
+ * If \p device has already been initialized then this call will fail 
+ * with the error ::cudaErrorSetOnActiveProcess.  In this case it is 
+ * necessary to reset \p device using ::cudaDeviceReset() before 
+ * VDPAU interoperability on \p device may be enabled.
+ *
+ * \param device - Device to use for VDPAU interoperability
+ * \param vdpDevice - The VdpDevice to interoperate with
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorSetOnActiveProcess
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsVDPAURegisterVideoSurface,
+ * ::cudaGraphicsVDPAURegisterOutputSurface,
+ * ::cudaDeviceReset
+ */
+extern __host__ cudaError_t CUDARTAPI cudaVDPAUSetVDPAUDevice(int device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+
+/**
+ * \brief Register a VdpVideoSurface object
+ *
+ * Registers the VdpVideoSurface specified by \p vdpSurface for access by CUDA.
+ * A handle to the registered object is returned as \p resource.
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::cudaGraphicsMapFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsMapFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * \param resource - Pointer to the returned object handle
+ * \param vdpSurface - VDPAU object to be registered
+ * \param flags - Map flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaVDPAUSetVDPAUDevice,
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsVDPAURegisterVideoSurface
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsVDPAURegisterVideoSurface(struct cudaGraphicsResource **resource, VdpVideoSurface vdpSurface, unsigned int flags);
+
+/**
+ * \brief Register a VdpOutputSurface object
+ *
+ * Registers the VdpOutputSurface specified by \p vdpSurface for access by CUDA.
+ * A handle to the registered object is returned as \p resource.
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::cudaGraphicsMapFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsMapFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * \param resource - Pointer to the returned object handle
+ * \param vdpSurface - VDPAU object to be registered
+ * \param flags - Map flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaVDPAUSetVDPAUDevice,
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsVDPAURegisterOutputSurface
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsVDPAURegisterOutputSurface(struct cudaGraphicsResource **resource, VdpOutputSurface vdpSurface, unsigned int flags);
+
+/** @} */ /* END CUDART_VDPAU */
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#endif /* __CUDA_VDPAU_INTEROP_H__ */
+
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudart_platform.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudart_platform.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f022bbe349eba2219a6b74f1ea315c1ce8551b7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudart_platform.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef __CUDART_PLATFORM_H__
+#define __CUDART_PLATFORM_H__
+
+#if ((defined(__linux__) || defined(__QNX__)) && (defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)))
+#define isEglSupported 1
+#endif /* (Linux or QNX) on arm, aarch64 or x86_64 */
+
+#endif /* __CUDART_PLATFORM_H__ */
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti.h
new file mode 100644
index 0000000000000000000000000000000000000000..be316531dcfd846bcea8feadf3604437ce2447a1
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2010-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_H_)
+#define _CUPTI_H_
+
+#ifdef _WIN32
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif /* WIN32_LEAN_AND_MEAN */
+#ifdef NOMINMAX
+#include <windows.h>
+#else /* NOMINMAX not set by the includer: define it only around <windows.h> */
+#define NOMINMAX
+#include <windows.h>
+#undef NOMINMAX
+#endif /* NOMINMAX */
+#endif /* _WIN32 */
+
+#include <cuda.h>
+#include <cupti_result.h>
+#include <cupti_version.h>
+
+/* Activity, callback, event and metric APIs */
+#include <cupti_activity.h>
+#include <cupti_callbacks.h>
+#include <cupti_events.h>
+#include <cupti_metrics.h>
+
+/* Runtime, driver, and nvtx function identifiers */
+#include <cupti_driver_cbid.h>
+#include <cupti_runtime_cbid.h>
+#include <cupti_nvtx_cbid.h>
+
+/* Placeholder types (each struct holds a single dummy int) to support function
+   parameter structures for obsoleted API. See cuda.h for the actual definitions. */
+typedef unsigned int CUdeviceptr_v1;
+typedef struct CUDA_MEMCPY2D_v1_st { int dummy; } CUDA_MEMCPY2D_v1;
+typedef struct CUDA_MEMCPY3D_v1_st { int dummy; } CUDA_MEMCPY3D_v1;
+typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st { int dummy; } CUDA_ARRAY_DESCRIPTOR_v1;
+typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st { int dummy; } CUDA_ARRAY3D_DESCRIPTOR_v1;
+
+/* Function parameter structures */
+#include <generated_cuda_runtime_api_meta.h>
+#include <generated_cuda_meta.h>
+
+/* The following parameter structures cannot be included unless a
+   header that defines GL_VERSION is included before including them.
+   If these are needed then make sure such a header is included
+   already. */
+#ifdef GL_VERSION
+#include <generated_cuda_gl_interop_meta.h>
+#include <generated_cudaGL_meta.h>
+#endif /* GL_VERSION */
+
+//#include <generated_nvtx_meta.h>
+
+/* The following parameter structures cannot be included by default as
+   they are not guaranteed to be available on all systems. Uncomment
+   the includes that are available, or use the include explicitly. */
+#if defined(__linux__)
+//#include <generated_cuda_vdpau_interop_meta.h>
+//#include <generated_cudaVDPAU_meta.h>
+#endif /* __linux__ */
+
+#ifdef _WIN32
+//#include <generated_cuda_d3d9_interop_meta.h>
+//#include <generated_cuda_d3d10_interop_meta.h>
+//#include <generated_cuda_d3d11_interop_meta.h>
+//#include <generated_cudaD3D9_meta.h>
+//#include <generated_cudaD3D10_meta.h>
+//#include <generated_cudaD3D11_meta.h>
+#endif /* _WIN32 */
+
+#endif /*_CUPTI_H_*/
+
+
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_activity_deprecated.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_activity_deprecated.h
new file mode 100644
index 0000000000000000000000000000000000000000..084ea84ed7be17af6d1634d772fd270fb5a0351f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_activity_deprecated.h
@@ -0,0 +1,4784 @@
+/*
+ * Copyright 2011-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_ACTIVITY_DEPRECATED_H_)
+#define _CUPTI_ACTIVITY_DEPRECATED_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \brief The kinds of activity records.
+ *
+ * Each activity record kind represents information about a GPU or an
+ * activity occurring on a CPU or GPU. Each kind is associated with an
+ * activity record structure that holds the information associated
+ * with the kind.
+ * \see CUpti_ActivityOverhead
+ * \see CUpti_ActivityOverhead2
+ * \see CUpti_ActivityDevice
+ * \see CUpti_ActivityDevice2
+ * \see CUpti_ActivityDevice3
+ * \see CUpti_ActivityDevice4
+ * \see CUpti_ActivityKernel
+ * \see CUpti_ActivityKernel2
+ * \see CUpti_ActivityKernel3
+ * \see CUpti_ActivityKernel4
+ * \see CUpti_ActivityKernel5
+ * \see CUpti_ActivityKernel6
+ * \see CUpti_ActivityKernel7
+ * \see CUpti_ActivityKernel8
+ * \see CUpti_ActivityMemcpy
+ * \see CUpti_ActivityMemcpy3
+ * \see CUpti_ActivityMemcpy4
+ * \see CUpti_ActivityMemcpyPtoP
+ * \see CUpti_ActivityMemcpyPtoP2
+ * \see CUpti_ActivityMemcpyPtoP3
+ * \see CUpti_ActivityMemset
+ * \see CUpti_ActivityMemset2
+ * \see CUpti_ActivityMemset3
+ * \see CUpti_ActivityMemory2
+ * \see CUpti_ActivityMemoryPool
+ * \see CUpti_ActivityMarker
+ * \see CUpti_ActivityGlobalAccess
+ * \see CUpti_ActivityGlobalAccess2
+ * \see CUpti_ActivityBranch
+ * \see CUpti_ActivityPCSampling
+ * \see CUpti_ActivityPCSampling2
+ * \see CUpti_ActivityUnifiedMemoryCounter
+ * \see CUpti_ActivityNvLink
+ * \see CUpti_ActivityNvLink2
+ * \see CUpti_ActivityNvLink3
+ */
+
+/**
+ * \brief The activity record for CUPTI and driver overheads.
+ * (Deprecated in CUDA 12.2)
+ *
+ * This activity record provides CUPTI and driver overhead information
+ * (CUPTI_ACTIVITY_KIND_OVERHEAD). These records are now reported using
+ * CUpti_ActivityOverhead3.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OVERHEAD.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of overhead: CUPTI, DRIVER, COMPILER, etc.
+   */
+  CUpti_ActivityOverheadKind overheadKind;
+
+  /**
+   * The kind of activity object that the overhead is associated with.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object. 'objectKind' indicates
+   * which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * The start timestamp for the overhead, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the overhead.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the overhead, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the overhead.
+   */
+  uint64_t end;
+} CUpti_ActivityOverhead;
+
+/**
+ * \brief The activity record for CUPTI and driver overheads.
+ *
+ * This activity record provides CUPTI and driver overhead information
+ * (CUPTI_ACTIVITY_KIND_OVERHEAD).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OVERHEAD.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of overhead: CUPTI, DRIVER, COMPILER, etc.
+   */
+  CUpti_ActivityOverheadKind overheadKind;
+
+  /**
+   * The kind of activity object that the overhead is associated with.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object. 'objectKind' indicates
+   * which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * The start timestamp for the overhead, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the overhead.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the overhead, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the overhead.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the overhead operation to which the
+   * records belong. This ID is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the overhead operation.
+   * In some cases, it can be zero, such as for CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH records.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Reserved for internal use.
+   */
+  uint32_t reserved0;
+} CUpti_ActivityOverhead2;
+
+/**
+ * \brief The activity record for a device. (deprecated)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+#ifdef CUPTILP64
+  /**
+   * Padding field, compiled in only when CUPTILP64 is defined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityDevice;
+
+/**
+ * \brief The activity record for a device. (deprecated)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for the device.
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Padding field, compiled in only when CUPTILP64 is not defined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityDevice2;
+
+/**
+ * \brief The activity record for a device. (CUDA 7.0 onwards)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for the device.
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Padding field, compiled in only when CUPTILP64 is not defined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Flag to indicate whether the device is visible to CUDA. Users can set
+   * the device visibility using the CUDA_VISIBLE_DEVICES environment variable.
+   */
+  uint8_t isCudaVisible;
+
+  uint8_t reserved[7]; /* Reserved bytes; meaning not documented in this header. */
+} CUpti_ActivityDevice3;
+
+/**
+ * \brief The activity record for a device. (CUDA 11.6 onwards)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for the device.
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Padding field, compiled in only when CUPTILP64 is not defined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Flag to indicate whether the device is visible to CUDA. Users can set
+   * the device visibility using the CUDA_VISIBLE_DEVICES environment variable.
+   */
+  uint8_t isCudaVisible;
+
+  /**
+   * MIG enabled flag for the device.
+   */
+  uint8_t isMigEnabled;
+
+  uint8_t reserved[6]; /* Reserved bytes; meaning not documented in this header. */
+
+  /**
+   * GPU Instance id for MIG enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t gpuInstanceId;
+
+  /**
+   * Compute Instance id for MIG enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t computeInstanceId;
+
+  /**
+   * The MIG UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid migUuid;
+
+} CUpti_ActivityDevice4;
+
+/**
+ * \brief The activity record for kernel. (deprecated)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL
+   * or CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The cache configuration requested by the kernel. The value is one
+   * of the CUfunc_cache enumeration values from cuda.h.
+   */
+  uint8_t cacheConfigRequested;
+
+  /**
+   * The cache configuration used for the kernel. The value is one of
+   * the CUfunc_cache enumeration values from cuda.h.
+   */
+  uint8_t cacheConfigExecuted;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the kernel. Each kernel execution
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t runtimeCorrelationId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel;
+
+/**
+ * \brief The activity record for kernel. (deprecated)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns.  It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel2;
+
+/**
+ * \brief The activity record for a kernel (CUDA 6.5(with sm_52 support) onwards).
+ * (deprecated in CUDA 9.0)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ * Kernel activities are now reported using the CUpti_ActivityKernel9 activity
+ * record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns.  It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel3;
+
+/**
+ * \brief The activity record for a kernel (CUDA 9.0(with sm_70 support) onwards).
+ * (deprecated in CUDA 11.0)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ * Kernel activities are now reported using the CUpti_ActivityKernel9 activity
+ * record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns.  It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host, the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+} CUpti_ActivityKernel4;
+
+/**
+ * \brief The activity record for a kernel (CUDA 11.0(with sm_80 support) onwards).
+ * (deprecated in CUDA 11.2)
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns.  It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host, the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether user has opted for a
+   * higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+} CUpti_ActivityKernel5;
+
+/**
+ * \brief The activity record for kernel. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns.  It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host, the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether user has opted for a
+   * higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+} CUpti_ActivityKernel6;
+
+/**
+ * \brief The activity record for kernel. (deprecated in CUDA 11.8)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The requested and executed cache configurations, stored together in
+   * one byte: \c both exposes the whole byte, \c config exposes the two
+   * 4-bit fields.
+   *
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * The command buffer is a buffer written by the CUDA driver to send
+   * commands such as kernel launches and memory copies to the GPU. All
+   * launches of CUDA kernels are asynchronous with respect to the host:
+   * the host requests the launch by writing commands into the command
+   * buffer, then returns without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates whether the kernel was executed via a regular launch or
+   * via a single/multi device cooperative launch.
+   * \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+  /**
+   * Shared memory size set by the driver.
+   */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether user has opted for a
+   * higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+
+  /**
+   * The ID of the HW channel on which the kernel is launched.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+} CUpti_ActivityKernel7;
+
+/**
+ * \brief The activity record for kernel.
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The requested and executed cache configurations, stored together in
+   * one byte: \c both exposes the whole byte, \c config exposes the two
+   * 4-bit fields.
+   *
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes (deprecated in CUDA 11.8).
+   * Refer to field localMemoryTotal_v2.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * The command buffer is a buffer written by the CUDA driver to send
+   * commands such as kernel launches and memory copies to the GPU. All
+   * launches of CUDA kernels are asynchronous with respect to the host:
+   * the host requests the launch by writing commands into the command
+   * buffer, then returns without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates whether the kernel was executed via a regular launch or
+   * via a single/multi device cooperative launch.
+   * \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+  /**
+   * Shared memory size set by the driver.
+   */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether user has opted for a
+   * higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+
+  /**
+   * The ID of the HW channel on which the kernel is launched.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * The X-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterX;
+
+  /**
+   * The Y-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterY;
+
+  /**
+   * The Z-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterZ;
+
+  /**
+   * The cluster scheduling policy for the kernel. Refer to
+   * CUclusterSchedulingPolicy.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterSchedulingPolicy;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes. 64-bit counterpart of the deprecated 32-bit field
+   * localMemoryTotal.
+   */
+  uint64_t localMemoryTotal_v2;
+} CUpti_ActivityKernel8;
+
+/**
+ * \brief The activity record for memory copies. (deprecated)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is compiled in only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemcpy;
+
+/**
+ * \brief The activity record for memory copies. (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is compiled in only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemcpy3;
+
+/**
+ * \brief The activity record for memory copies. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is compiled in only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemcpy4;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2) but is no longer generated
+ * by CUPTI. Peer-to-peer memory copy activities are now reported using the
+ * CUpti_ActivityMemcpyPtoP2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is compiled in only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemcpyPtoP;
+
+/**
+ * CUpti_ActivityMemcpy2 is an alias for CUpti_ActivityMemcpyPtoP.
+ */
+typedef CUpti_ActivityMemcpyPtoP CUpti_ActivityMemcpy2;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ * (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is compiled in only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed the memcpy through graph launch.
+   * This field will be 0 if memcpy is not done using graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemcpyPtoP2;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ * (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed the memcpy through graph launch.
+   * This field will be 0 if memcpy is not done using graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemcpyPtoP3;
+
+/**
+ * \brief The activity record for memset. (deprecated)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemset;
+
+/**
+ * \brief The activity record for memset. (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemset2;
+
+/**
+ * \brief The activity record for memset. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemset3;
+
+/**
+ * \brief The activity record for memory.
+ *
+ * This activity record represents a memory allocation and free operation
+ * (CUPTI_ACTIVITY_KIND_MEMORY2).
+ * This activity record provides separate records for memory allocation and
+ * memory release operations.
+ * This allows the corresponding driver and runtime API activity record
+ * to be correlated with the memory operation.
+ *
+ * Note: This activity record is an upgrade over \ref CUpti_ActivityMemory
+ * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY.
+ * \ref CUpti_ActivityMemory provides a single record for the memory
+ * allocation and memory release operations.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory operation requested by the user, \ref CUpti_ActivityMemoryOperationType.
+   */
+  CUpti_ActivityMemoryOperationType memoryOperationType;
+
+  /**
+   * The memory kind requested by the user, \ref CUpti_ActivityMemoryKind.
+   */
+  CUpti_ActivityMemoryKind memoryKind;
+
+  /**
+   * The correlation ID of the memory operation. Each memory operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The number of bytes of memory allocated.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The program counter of the memory operation.
+   */
+  uint64_t PC;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory operation is taking place.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream. If memory operation is not async, \p streamId is set to CUPTI_INVALID_STREAM_ID.
+   */
+  uint32_t streamId;
+
+  /**
+   * Variable name. This name is shared across all activity
+   * records representing the same symbol, and so should not be
+   * modified.
+   */
+  const char* name;
+
+  /**
+   * \p isAsync is set if memory operation happens through async memory APIs.
+   */
+  uint32_t isAsync;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * The memory pool configuration used for the memory operations.
+   */
+  struct {
+    /**
+     * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType.
+     */
+    CUpti_ActivityMemoryPoolType memoryPoolType;
+
+#ifdef CUPTILP64
+    /**
+     * Undefined. Reserved for internal use.
+     */
+    uint32_t pad2;
+#endif
+
+    /**
+     * The base address of the memory pool.
+     */
+    uint64_t address;
+
+    /**
+     * The release threshold of the memory pool in bytes. \p releaseThreshold is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t releaseThreshold;
+
+   /**
+   * The size of the memory pool in bytes, or the process ID of the memory pool.
+   * \p size is valid if \p memoryPoolType is
+   * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   * \p processId is valid if \p memoryPoolType is
+   * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED, \ref CUpti_ActivityMemoryPoolType.
+   */
+   union {
+      uint64_t size;
+      uint64_t processId;
+    } pool;
+  } memoryPoolConfig;
+
+} CUpti_ActivityMemory2;
+
+/**
+ * \brief The activity record for memory pool.
+ *
+ * This activity record represents a memory pool creation, destruction and
+ * trimming (CUPTI_ACTIVITY_KIND_MEMORY_POOL).
+ * This activity record provides separate records for memory pool creation,
+ * destruction and trimming operations.
+ * This allows the corresponding driver and runtime API activity record
+ * to be correlated with the memory pool operation.
+ *
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY_POOL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory operation requested by the user, \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  CUpti_ActivityMemoryPoolOperationType memoryPoolOperationType;
+
+  /**
+   * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType.
+   */
+  CUpti_ActivityMemoryPoolType memoryPoolType;
+
+  /**
+   * The correlation ID of the memory pool operation. Each memory pool
+   * operation is assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory pool is created.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The minimum bytes to keep of the memory pool. \p minBytesToKeep is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED,
+   * \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  size_t minBytesToKeep;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The size of the memory pool operation in bytes. \p size is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t size;
+
+  /**
+   * The release threshold of the memory pool. \p releaseThreshold is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t releaseThreshold;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+} CUpti_ActivityMemoryPool;
+
+/**
+ * \brief The activity record providing a marker which is an
+ * instantaneous point in time. (deprecated in CUDA 8.0)
+ *
+ * The marker is specified with a descriptive name and unique id
+ * (CUPTI_ACTIVITY_KIND_MARKER).
+ * Marker activity is now reported using the
+ * CUpti_ActivityMarker2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the marker. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The timestamp for the marker, in ns. A value of 0 indicates that
+   * timestamp information could not be collected for the marker.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The marker ID.
+   */
+  uint32_t id;
+
+  /**
+   * The kind of activity object associated with this marker. \see CUpti_ActivityObjectKind
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object associated with this
+   * marker. 'objectKind' indicates which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The marker name for an instantaneous or start marker. This will
+   * be NULL for an end marker.
+   */
+  const char *name;
+
+} CUpti_ActivityMarker;
+
+/**
+ * \brief The activity record for source-level global
+ * access. (deprecated)
+ *
+ * This activity records the locations of the global
+ * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
+ * Global access activities are now reported using the
+ * CUpti_ActivityGlobalAccess3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this global access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * Incremented each time this instruction is executed, by the number of
+   * threads that executed it with predicate and condition code evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of 32 bytes transactions to L2 cache generated by this access.
+   */
+  uint64_t l2_transactions;
+} CUpti_ActivityGlobalAccess;
+
+/**
+ * \brief The activity record for source-level global
+ * access. (deprecated in CUDA 9.0)
+ *
+ * This activity records the locations of the global
+ * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
+ * Global access activities are now reported using the
+ * CUpti_ActivityGlobalAccess3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this global access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Incremented each time this instruction is executed, by the number of
+   * threads that executed it with predicate and condition code evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of 32 bytes transactions to L2 cache generated by this access.
+   */
+  uint64_t l2_transactions;
+
+  /**
+   * The minimum number of L2 transactions possible based on the access pattern.
+   */
+  uint64_t theoreticalL2Transactions;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityGlobalAccess2;
+
+/**
+ * \brief The activity record for source level result
+ * branch. (deprecated)
+ *
+ * This activity records the locations of the branches in the
+ * source (CUPTI_ACTIVITY_KIND_BRANCH).
+ * Branch activities are now reported using the
+ * CUpti_ActivityBranch2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_BRANCH.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The pc offset for the branch.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * regardless of predicate or condition code.
+   */
+  uint32_t executed;
+
+  /**
+   * Number of times this branch diverged.
+   */
+  uint32_t diverged;
+
+  /**
+   * Incremented each time this instruction is executed, by the number of
+   * threads that executed it.
+   */
+  uint64_t threadsExecuted;
+} CUpti_ActivityBranch;
+
+/**
+ * \brief The activity record for PC sampling. (deprecated in CUDA 8.0)
+ *
+ * This activity records information obtained by sampling PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
+ * PC sampling activities are now reported using the
+ * CUpti_ActivityPCSampling2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * The same PC can be sampled with different stall reasons.
+   */
+  uint32_t samples;
+
+  /**
+   * Current stall reason. Includes one of the reasons from
+   * \ref CUpti_ActivityPCSamplingStallReason.
+   */
+  CUpti_ActivityPCSamplingStallReason stallReason;
+} CUpti_ActivityPCSampling;
+
+/**
+ * \brief The activity record for PC sampling. (deprecated in CUDA 9.0)
+ *
+ * This activity records information obtained by sampling PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
+ * PC sampling activities are now reported using the
+ * CUpti_ActivityPCSampling3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * These samples indicate that no instruction was issued in that cycle from
+   * the warp scheduler from where the warp was sampled.
+   * Field is valid for devices with compute capability 6.0 and higher.
+   */
+  uint32_t latencySamples;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * The same PC can be sampled with different stall reasons. The count includes
+   * latencySamples.
+   */
+  uint32_t samples;
+
+  /**
+   * Current stall reason. Includes one of the reasons from
+   * \ref CUpti_ActivityPCSamplingStallReason.
+   */
+  CUpti_ActivityPCSamplingStallReason stallReason;
+
+  uint32_t pad; /* Padding field; presumably reserved like the other pad members -- TODO confirm. */
+} CUpti_ActivityPCSampling2;
+
+/**
+ * \brief The activity record for Unified Memory counters (deprecated in CUDA 7.0)
+ *
+ * This activity record represents a Unified Memory counter
+ * (CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The Unified Memory counter kind. See \ref CUpti_ActivityUnifiedMemoryCounterKind
+   */
+  CUpti_ActivityUnifiedMemoryCounterKind counterKind;
+
+  /**
+   * Scope of the Unified Memory counter. See \ref CUpti_ActivityUnifiedMemoryCounterScope
+   */
+  CUpti_ActivityUnifiedMemoryCounterScope scope;
+
+  /**
+   * The ID of the device involved in the memory transfer operation.
+   * It is not relevant if the scope of the counter is global (all devices).
+   */
+  uint32_t deviceId;
+
+  /**
+   * Value of the counter.
+   *
+   */
+  uint64_t value;
+
+  /**
+   * The timestamp when this sample was retrieved, in ns. A value of 0
+   * indicates that timestamp information could not be collected.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The ID of the process to which this record belongs. In case of
+   * global scope, processId is undefined.
+   */
+  uint32_t processId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityUnifiedMemoryCounter;
+
+/**
+* \brief NVLink information. (deprecated in CUDA 9.0)
+*
+* This structure gives capabilities of each logical NVLink connection between two devices,
+* gpu<->gpu or gpu<->CPU which can be used to understand the topology.
+* NVLink information is now reported using the
+* CUpti_ActivityNvLink2 activity record.
+*/
+typedef struct PACKED_ALIGNMENT {
+  /**
+  * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+  */
+  CUpti_ActivityKind kind;
+
+  /**
+  * NVLink version.
+  */
+  uint32_t nvlinkVersion;
+
+  /**
+  * Type of device 0 \ref CUpti_DevType
+  */
+  CUpti_DevType typeDev0;
+
+  /**
+  * Type of device 1 \ref CUpti_DevType
+  */
+  CUpti_DevType typeDev1;
+
+  /**
+  * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5.
+  * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+  */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+      * Index of the NPU. First index will always be zero.
+      */
+      uint32_t index;
+
+      /**
+      * Domain ID of NPU. On Linux, this can be queried using lspci.
+      */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+  * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5.
+  * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+  */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+      * Index of the NPU. First index will always be zero.
+      */
+      uint32_t index;
+
+      /**
+      * Domain ID of NPU. On Linux, this can be queried using lspci.
+      */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+  * Flag giving the capabilities of the link. \see CUpti_LinkFlag
+  */
+  uint32_t flag;
+
+  /**
+  * Number of physical NVLinks present between two devices.
+  */
+  uint32_t physicalNvLinkCount;
+
+  /**
+  * Port numbers for maximum 4 NVLinks connected to device 0.
+  * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+  * In case of invalid/unknown port number, this field will be set
+  * to value CUPTI_NVLINK_INVALID_PORT.
+  * This will be used to correlate the metric values to individual
+  * physical link and attribute traffic to the logical NVLink in
+  * the topology.
+  */
+  int8_t portDev0[4];
+
+  /**
+  * Port numbers for maximum 4 NVLinks connected to device 1.
+  * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+  * In case of invalid/unknown port number, this field will be set
+  * to value CUPTI_NVLINK_INVALID_PORT.
+  * This will be used to correlate the metric values to individual
+  * physical link and attribute traffic to the logical NVLink in
+  * the topology.
+  */
+  int8_t portDev1[4];
+
+  /**
+  * Bandwidth of NVLink in kbytes/sec.
+  */
+  uint64_t bandwidth;
+} CUpti_ActivityNvLink;
+
+/**
+* \brief NVLink information. (deprecated in CUDA 10.0)
+*
+* This structure gives capabilities of each logical NVLink connection between two devices,
+* gpu<->gpu or gpu<->CPU which can be used to understand the topology.
+* NVLink information is now reported using the
+* CUpti_ActivityNvLink4 activity record.
+*/
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * NVLink version.
+   */
+  uint32_t nvlinkVersion;
+
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+   * Flag giving the capabilities of the link. \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t physicalNvLinkCount;
+
+  /**
+   * Port numbers for maximum 16 NVLinks connected to device 0.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev0[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Port numbers for maximum 16 NVLinks connected to device 1.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev1[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Bandwidth of NVLink in kbytes/sec.
+   */
+  uint64_t  bandwidth;
+} CUpti_ActivityNvLink2;
+
+/**
+* \brief NVLink information.
+*
+* This structure gives capabilities of each logical NVLink connection between two devices,
+* gpu<->gpu or gpu<->CPU which can be used to understand the topology.
+* NVLink information is now reported using the
+* CUpti_ActivityNvLink4 activity record.
+*/
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+  /**
+   * NVLink version.
+   */
+  uint32_t nvlinkVersion;
+
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+   * Flag giving the capabilities of the link. \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t physicalNvLinkCount;
+
+  /**
+   * Port numbers for maximum 16 NVLinks connected to device 0.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev0[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Port numbers for maximum 16 NVLinks connected to device 1.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev1[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Bandwidth of NVLink in kbytes/sec.
+   */
+  uint64_t bandwidth;
+
+  /**
+   * NVSwitch is connected as an intermediate node.
+   */
+  uint8_t nvswitchConnected;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[7];
+} CUpti_ActivityNvLink3;
+
+/**
+ * \brief The activity record for trace of graph execution.
+ *
+ * This activity record represents execution for a graph without giving visibility
+ * about the execution of its nodes. This is intended to reduce overheads in tracing
+ * each node. The activity kind is CUPTI_ACTIVITY_KIND_GRAPH_TRACE.
+ * Graph trace activity is now reported using the CUpti_ActivityGraphTrace2 record.
+ */
+typedef struct { /* NOTE(review): no PACKED_ALIGNMENT here, unlike the neighboring activity records - confirm intended */
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GRAPH_TRACE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the graph launch. Each graph launch is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the graph.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The start timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the graph execution is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The unique ID of the graph that is launched.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the context where the graph is being launched.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the graph is being launched.
+   */
+  uint32_t streamId;
+
+  /**
+   * This field is reserved for internal use.
+   */
+  void *reserved;
+} CUpti_ActivityGraphTrace;
+
+/**
+ * \brief The activity record for a context.
+ *
+ * This activity record represents information about a context
+ * (CUPTI_ACTIVITY_KIND_CONTEXT).
+ * Context activity is now reported using the CUpti_ActivityContext2 record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CONTEXT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The context ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The compute API kind. \see CUpti_ActivityComputeApiKind
+   */
+  uint16_t computeApiKind;
+
+  /**
+   * The ID for the NULL stream in this context.
+   */
+  uint16_t nullStreamId;
+} CUpti_ActivityContext;
+
+/**
+ * \brief The activity record for JIT operations.
+ * This activity represents the JIT operations (compile, load, store) of a CUmodule
+ * from the Compute Cache.
+ * Gives the exact hashed path of where the cached module is loaded from,
+ * or where the module will be stored after Just-In-Time (JIT) compilation.
+ *
+ * JIT activity is now reported using the CUpti_ActivityJit2 record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_JIT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The JIT entry type.
+   */
+  CUpti_ActivityJitEntryType jitEntryType;
+
+  /**
+   * The JIT operation type.
+   */
+  CUpti_ActivityJitOperationType jitOperationType;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The start timestamp for the JIT operation, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the JIT operation.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the JIT operation, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the JIT operation.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the JIT operation to which
+   * the records belong. Each JIT operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the JIT operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Internal use.
+   */
+  uint32_t padding;
+
+  /**
+   * The correlation ID to correlate JIT compilation, load and store operations.
+   * Each JIT compilation unit is assigned a unique correlation ID
+   * at the time of the JIT compilation. This correlation ID can be used
+   * to find the matching JIT cache load/store records.
+   */
+  uint64_t jitOperationCorrelationId;
+
+  /**
+   * The size of the compute cache.
+   */
+  uint64_t cacheSize;
+
+  /**
+   * The path where the fat binary is cached.
+   */
+  const char* cachePath;
+} CUpti_ActivityJit;
+
+
+#if defined(__GNUC__) && defined(CUPTI_LIB) /* pairs with a "visibility push" presumably earlier in this header - confirm */
+    #pragma GCC visibility pop
+#endif /* __GNUC__ && CUPTI_LIB */
+
+#if defined(__cplusplus)
+} /* presumably closes an extern "C" block opened earlier in this header - confirm */
+#endif /* __cplusplus */
+
+#endif /*_CUPTI_ACTIVITY_DEPRECATED_H_*/
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_common.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..96d228c4df3c1f090a4979bfe10132e080042fef
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_common.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+ 
+#if !defined(__CUPTI_COMMON_H__)
+#define __CUPTI_COMMON_H__
+/* CUPTIAPI: __stdcall on Windows, empty elsewhere. Left untouched if the includer already defined it. */
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif /* _WIN32 */
+#endif /* CUPTIAPI */
+/* CUPTIUTILAPI: same scheme as CUPTIAPI - __stdcall on Windows, empty elsewhere, overridable by the includer. */
+#ifndef CUPTIUTILAPI
+#ifdef _WIN32
+#define CUPTIUTILAPI __stdcall
+#else
+#define CUPTIUTILAPI
+#endif /* _WIN32 */
+#endif /* CUPTIUTILAPI */
+/* CUPTILP64: defined as 1 when __LP64__ or _WIN64 is defined; explicitly undefined otherwise. */
+#if defined(__LP64__)
+#define CUPTILP64 1
+#elif defined(_WIN64)
+#define CUPTILP64 1
+#else
+#undef CUPTILP64
+#endif /* __LP64__ / _WIN64 */
+/* Per-compiler packing helpers: Windows packs via the START_/END_ pragma pair, GCC via PACKED_ALIGNMENT itself. */
+#define ACTIVITY_RECORD_ALIGNMENT 8 /* alignment value passed to align()/aligned() below */
+#if defined(_WIN32) // Windows 32- and 64-bit
+#define START_PACKED_ALIGNMENT __pragma(pack(push,1)) // exact fit - no padding
+#define PACKED_ALIGNMENT __declspec(align(ACTIVITY_RECORD_ALIGNMENT))
+#define END_PACKED_ALIGNMENT __pragma(pack(pop))
+#elif defined(__GNUC__) // GCC
+#define START_PACKED_ALIGNMENT
+#define PACKED_ALIGNMENT __attribute__ ((__packed__)) __attribute__ ((aligned (ACTIVITY_RECORD_ALIGNMENT)))
+#define END_PACKED_ALIGNMENT
+#else // all other compilers: all three macros are empty, so no packing or alignment is requested
+#define START_PACKED_ALIGNMENT
+#define PACKED_ALIGNMENT
+#define END_PACKED_ALIGNMENT
+#endif /* _WIN32 / __GNUC__ */
+
+#endif /*__CUPTI_COMMON_H__*/
+
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_driver_cbid.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_driver_cbid.h
new file mode 100644
index 0000000000000000000000000000000000000000..331f2e4d63ff18f677c763570845bdc77fa25faf
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_driver_cbid.h
@@ -0,0 +1,767 @@
+
+// *************************************************************************
+//      Definitions of indices for API functions, unique across entire API
+// *************************************************************************
+
+// This file is generated.  Any changes you make will be lost during the next clean build.
+// CUDA public interface, for type definitions and cu* function prototypes
+
+typedef enum CUpti_driver_api_trace_cbid_enum {
+    CUPTI_DRIVER_TRACE_CBID_INVALID                                                        = 0,
+    CUPTI_DRIVER_TRACE_CBID_cuInit                                                         = 1,
+    CUPTI_DRIVER_TRACE_CBID_cuDriverGetVersion                                             = 2,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGet                                                    = 3,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetCount                                               = 4,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetName                                                = 5,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceComputeCapability                                      = 6,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceTotalMem                                               = 7,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetProperties                                          = 8,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetAttribute                                           = 9,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxCreate                                                    = 10,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy                                                   = 11,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxAttach                                                    = 12,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxDetach                                                    = 13,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxPushCurrent                                               = 14,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxPopCurrent                                                = 15,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetDevice                                                 = 16,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSynchronize                                               = 17,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleLoad                                                   = 18,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleLoadData                                               = 19,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleLoadDataEx                                             = 20,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleLoadFatBinary                                          = 21,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleUnload                                                 = 22,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetFunction                                            = 23,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetGlobal                                              = 24,
+    CUPTI_DRIVER_TRACE_CBID_cu64ModuleGetGlobal                                            = 25,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetTexRef                                              = 26,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetInfo                                                   = 27,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemGetInfo                                                 = 28,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAlloc                                                     = 29,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemAlloc                                                   = 30,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch                                                = 31,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemAllocPitch                                              = 32,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFree                                                      = 33,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemFree                                                    = 34,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAddressRange                                           = 35,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemGetAddressRange                                         = 36,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocHost                                                 = 37,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFreeHost                                                  = 38,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostAlloc                                                 = 39,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostGetDevicePointer                                      = 40,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemHostGetDevicePointer                                    = 41,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostGetFlags                                              = 42,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD                                                   = 43,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyHtoD                                                 = 44,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH                                                   = 45,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoH                                                 = 46,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD                                                   = 47,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoD                                                 = 48,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA                                                   = 49,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoA                                                 = 50,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD                                                   = 51,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyAtoD                                                 = 52,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA                                                   = 53,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH                                                   = 54,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA                                                   = 55,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D                                                     = 56,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned                                            = 57,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D                                                     = 58,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy3D                                                   = 59,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync                                              = 60,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyHtoDAsync                                            = 61,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync                                              = 62,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoHAsync                                            = 63,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync                                              = 64,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoDAsync                                            = 65,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync                                              = 66,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync                                              = 67,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync                                                = 68,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync                                                = 69,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy3DAsync                                              = 70,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8                                                     = 71,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD8                                                   = 72,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16                                                    = 73,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD16                                                  = 74,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32                                                    = 75,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD32                                                  = 76,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8                                                   = 77,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D8                                                 = 78,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16                                                  = 79,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D16                                                = 80,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32                                                  = 81,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D32                                                = 82,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetBlockShape                                            = 83,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetSharedSize                                            = 84,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncGetAttribute                                             = 85,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetCacheConfig                                           = 86,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayCreate                                                  = 87,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetDescriptor                                           = 88,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayDestroy                                                 = 89,
+    CUPTI_DRIVER_TRACE_CBID_cuArray3DCreate                                                = 90,
+    CUPTI_DRIVER_TRACE_CBID_cuArray3DGetDescriptor                                         = 91,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefCreate                                                 = 92,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefDestroy                                                = 93,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetArray                                               = 94,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress                                             = 95,
+    CUPTI_DRIVER_TRACE_CBID_cu64TexRefSetAddress                                           = 96,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D                                           = 97,
+    CUPTI_DRIVER_TRACE_CBID_cu64TexRefSetAddress2D                                         = 98,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFormat                                              = 99,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddressMode                                         = 100,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFilterMode                                          = 101,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFlags                                               = 102,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddress                                             = 103,
+    CUPTI_DRIVER_TRACE_CBID_cu64TexRefGetAddress                                           = 104,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetArray                                               = 105,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddressMode                                         = 106,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFilterMode                                          = 107,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFormat                                              = 108,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFlags                                               = 109,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSetSize                                                 = 110,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSeti                                                    = 111,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSetf                                                    = 112,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSetv                                                    = 113,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSetTexRef                                               = 114,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunch                                                       = 115,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid                                                   = 116,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchGridAsync                                              = 117,
+    CUPTI_DRIVER_TRACE_CBID_cuEventCreate                                                  = 118,
+    CUPTI_DRIVER_TRACE_CBID_cuEventRecord                                                  = 119,
+    CUPTI_DRIVER_TRACE_CBID_cuEventQuery                                                   = 120,
+    CUPTI_DRIVER_TRACE_CBID_cuEventSynchronize                                             = 121,
+    CUPTI_DRIVER_TRACE_CBID_cuEventDestroy                                                 = 122,
+    CUPTI_DRIVER_TRACE_CBID_cuEventElapsedTime                                             = 123,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamCreate                                                 = 124,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamQuery                                                  = 125,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize                                            = 126,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamDestroy                                                = 127,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnregisterResource                                   = 128,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsSubResourceGetMappedArray                            = 129,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedPointer                             = 130,
+    CUPTI_DRIVER_TRACE_CBID_cu64GraphicsResourceGetMappedPointer                           = 131,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceSetMapFlags                                  = 132,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsMapResources                                         = 133,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnmapResources                                       = 134,
+    CUPTI_DRIVER_TRACE_CBID_cuGetExportTable                                               = 135,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetLimit                                                  = 136,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetLimit                                                  = 137,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDevice                                               = 138,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreate                                               = 139,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D10RegisterResource                                = 140,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10RegisterResource                                        = 141,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10UnregisterResource                                      = 142,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10MapResources                                            = 143,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10UnmapResources                                          = 144,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceSetMapFlags                                     = 145,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedArray                                  = 146,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPointer                                = 147,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedSize                                   = 148,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPitch                                  = 149,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetSurfaceDimensions                            = 150,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDevice                                               = 151,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreate                                               = 152,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D11RegisterResource                                = 153,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDevice                                                = 154,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreate                                                = 155,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D9RegisterResource                                 = 156,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDirect3DDevice                                        = 157,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9RegisterResource                                         = 158,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9UnregisterResource                                       = 159,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9MapResources                                             = 160,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9UnmapResources                                           = 161,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceSetMapFlags                                      = 162,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetSurfaceDimensions                             = 163,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedArray                                   = 164,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPointer                                 = 165,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedSize                                    = 166,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPitch                                   = 167,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9Begin                                                    = 168,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9End                                                      = 169,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9RegisterVertexBuffer                                     = 170,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9MapVertexBuffer                                          = 171,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9UnmapVertexBuffer                                        = 172,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9UnregisterVertexBuffer                                   = 173,
+    CUPTI_DRIVER_TRACE_CBID_cuGLCtxCreate                                                  = 174,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsGLRegisterBuffer                                     = 175,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsGLRegisterImage                                      = 176,
+    CUPTI_DRIVER_TRACE_CBID_cuWGLGetDevice                                                 = 177,
+    CUPTI_DRIVER_TRACE_CBID_cuGLInit                                                       = 178,
+    CUPTI_DRIVER_TRACE_CBID_cuGLRegisterBufferObject                                       = 179,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject                                            = 180,
+    CUPTI_DRIVER_TRACE_CBID_cuGLUnmapBufferObject                                          = 181,
+    CUPTI_DRIVER_TRACE_CBID_cuGLUnregisterBufferObject                                     = 182,
+    CUPTI_DRIVER_TRACE_CBID_cuGLSetBufferObjectMapFlags                                    = 183,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync                                       = 184,
+    CUPTI_DRIVER_TRACE_CBID_cuGLUnmapBufferObjectAsync                                     = 185,
+    CUPTI_DRIVER_TRACE_CBID_cuVDPAUGetDevice                                               = 186,
+    CUPTI_DRIVER_TRACE_CBID_cuVDPAUCtxCreate                                               = 187,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsVDPAURegisterVideoSurface                            = 188,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsVDPAURegisterOutputSurface                           = 189,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetSurfRef                                             = 190,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefCreate                                                = 191,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefDestroy                                               = 192,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefSetFormat                                             = 193,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefSetArray                                              = 194,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefGetFormat                                             = 195,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefGetArray                                              = 196,
+    CUPTI_DRIVER_TRACE_CBID_cu64DeviceTotalMem                                             = 197,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedPointer                              = 198,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedSize                                 = 199,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedPitch                                = 200,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetSurfaceDimensions                          = 201,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetSurfaceDimensions                           = 202,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedPointer                               = 203,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedSize                                  = 204,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedPitch                                 = 205,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9MapVertexBuffer                                        = 206,
+    CUPTI_DRIVER_TRACE_CBID_cu64GLMapBufferObject                                          = 207,
+    CUPTI_DRIVER_TRACE_CBID_cu64GLMapBufferObjectAsync                                     = 208,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDevices                                              = 209,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreateOnDevice                                       = 210,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDevices                                              = 211,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreateOnDevice                                       = 212,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDevices                                               = 213,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreateOnDevice                                        = 214,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemHostAlloc                                               = 215,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async                                                = 216,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD8Async                                              = 217,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async                                               = 218,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD16Async                                             = 219,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async                                               = 220,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD32Async                                             = 221,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async                                              = 222,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D8Async                                            = 223,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async                                             = 224,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D16Async                                           = 225,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async                                             = 226,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D32Async                                           = 227,
+    CUPTI_DRIVER_TRACE_CBID_cu64ArrayCreate                                                = 228,
+    CUPTI_DRIVER_TRACE_CBID_cu64ArrayGetDescriptor                                         = 229,
+    CUPTI_DRIVER_TRACE_CBID_cu64Array3DCreate                                              = 230,
+    CUPTI_DRIVER_TRACE_CBID_cu64Array3DGetDescriptor                                       = 231,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2D                                                   = 232,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2DUnaligned                                          = 233,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2DAsync                                              = 234,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxCreate_v2                                                 = 235,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreate_v2                                            = 236,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreate_v2                                            = 237,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreate_v2                                             = 238,
+    CUPTI_DRIVER_TRACE_CBID_cuGLCtxCreate_v2                                               = 239,
+    CUPTI_DRIVER_TRACE_CBID_cuVDPAUCtxCreate_v2                                            = 240,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetGlobal_v2                                           = 241,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetInfo_v2                                                = 242,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2                                                  = 243,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch_v2                                             = 244,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFree_v2                                                   = 245,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAddressRange_v2                                        = 246,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostGetDevicePointer_v2                                   = 247,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy_v2                                                    = 248,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2                                                  = 249,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2                                                 = 250,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2                                                 = 251,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2                                                = 252,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2                                               = 253,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2                                               = 254,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress_v2                                          = 255,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D_v2                                        = 256,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddress_v2                                          = 257,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedPointer_v2                          = 258,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceTotalMem_v2                                            = 259,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPointer_v2                             = 260,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedSize_v2                                = 261,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPitch_v2                               = 262,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetSurfaceDimensions_v2                         = 263,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetSurfaceDimensions_v2                          = 264,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPointer_v2                              = 265,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedSize_v2                                 = 266,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPitch_v2                                = 267,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9MapVertexBuffer_v2                                       = 268,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject_v2                                         = 269,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync_v2                                    = 270,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostAlloc_v2                                              = 271,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayCreate_v2                                               = 272,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetDescriptor_v2                                        = 273,
+    CUPTI_DRIVER_TRACE_CBID_cuArray3DCreate_v2                                             = 274,
+    CUPTI_DRIVER_TRACE_CBID_cuArray3DGetDescriptor_v2                                      = 275,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2                                                = 276,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2                                           = 277,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2                                                = 278,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2                                           = 279,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2                                                = 280,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2                                           = 281,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2                                                = 282,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2                                           = 283,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2                                                = 284,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2                                                = 285,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2                                                = 286,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2                                                  = 287,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2                                         = 288,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2                                             = 289,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2                                                  = 290,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2                                             = 291,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2                                                = 292,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2                                           = 293,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocHost_v2                                              = 294,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitEvent                                              = 295,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetApiVersion                                             = 296,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDirect3DDevice                                       = 297,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDirect3DDevice                                       = 298,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetCacheConfig                                            = 299,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetCacheConfig                                            = 300,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostRegister                                              = 301,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostUnregister                                            = 302,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetCurrent                                                = 303,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetCurrent                                                = 304,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy                                                       = 305,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync                                                  = 306,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel                                                 = 307,
+    CUPTI_DRIVER_TRACE_CBID_cuProfilerStart                                                = 308,
+    CUPTI_DRIVER_TRACE_CBID_cuProfilerStop                                                 = 309,
+    CUPTI_DRIVER_TRACE_CBID_cuPointerGetAttribute                                          = 310,
+    CUPTI_DRIVER_TRACE_CBID_cuProfilerInitialize                                           = 311,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceCanAccessPeer                                          = 312,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxEnablePeerAccess                                          = 313,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxDisablePeerAccess                                         = 314,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPeerRegister                                              = 315,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPeerUnregister                                            = 316,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPeerGetDevicePointer                                      = 317,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer                                                   = 318,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync                                              = 319,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeer                                                 = 320,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeerAsync                                            = 321,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy_v2                                                = 322,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxPushCurrent_v2                                            = 323,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxPopCurrent_v2                                             = 324,
+    CUPTI_DRIVER_TRACE_CBID_cuEventDestroy_v2                                              = 325,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamDestroy_v2                                             = 326,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D_v3                                        = 327,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcGetMemHandle                                              = 328,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcOpenMemHandle                                             = 329,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcCloseMemHandle                                            = 330,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetByPCIBusId                                          = 331,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetPCIBusId                                            = 332,
+    CUPTI_DRIVER_TRACE_CBID_cuGLGetDevices                                                 = 333,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcGetEventHandle                                            = 334,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcOpenEventHandle                                           = 335,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetSharedMemConfig                                        = 336,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetSharedMemConfig                                        = 337,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetSharedMemConfig                                       = 338,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectCreate                                              = 339,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectDestroy                                             = 340,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetResourceDesc                                     = 341,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetTextureDesc                                      = 342,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfObjectCreate                                             = 343,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfObjectDestroy                                            = 344,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfObjectGetResourceDesc                                    = 345,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamAddCallback                                            = 346,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayCreate                                         = 347,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetLevel                                       = 348,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayDestroy                                        = 349,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmappedArray                                      = 350,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapFilterMode                                    = 351,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapLevelBias                                     = 352,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapLevelClamp                                    = 353,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMaxAnisotropy                                       = 354,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmappedArray                                      = 355,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapFilterMode                                    = 356,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapLevelBias                                     = 357,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapLevelClamp                                    = 358,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMaxAnisotropy                                       = 359,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedMipmappedArray                      = 360,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetResourceViewDesc                                 = 361,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkCreate                                                   = 362,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkAddData                                                  = 363,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkAddFile                                                  = 364,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkComplete                                                 = 365,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkDestroy                                                  = 366,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamCreateWithPriority                                     = 367,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetPriority                                            = 368,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetFlags                                               = 369,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetStreamPriorityRange                                    = 370,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocManaged                                              = 371,
+    CUPTI_DRIVER_TRACE_CBID_cuGetErrorString                                               = 372,
+    CUPTI_DRIVER_TRACE_CBID_cuGetErrorName                                                 = 373,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveBlocksPerMultiprocessor                    = 374,
+    CUPTI_DRIVER_TRACE_CBID_cuCompilePtx                                                   = 375,
+    CUPTI_DRIVER_TRACE_CBID_cuBinaryFree                                                   = 376,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamAttachMemAsync                                         = 377,
+    CUPTI_DRIVER_TRACE_CBID_cuPointerSetAttribute                                          = 378,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostRegister_v2                                           = 379,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceSetMapFlags_v2                               = 380,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkCreate_v2                                                = 381,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkAddData_v2                                               = 382,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkAddFile_v2                                               = 383,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialBlockSize                               = 384,
+    CUPTI_DRIVER_TRACE_CBID_cuGLGetDevices_v2                                              = 385,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRetain                                       = 386,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRelease                                      = 387,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxSetFlags                                     = 388,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxReset                                        = 389,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsEGLRegisterImage                                     = 390,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetFlags                                                  = 391,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxGetState                                     = 392,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerConnect                                     = 393,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerDisconnect                                  = 394,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerAcquireFrame                                = 395,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerReleaseFrame                                = 396,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2_ptds                                           = 397,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2_ptds                                           = 398,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2_ptds                                           = 399,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2_ptds                                           = 400,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2_ptds                                           = 401,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2_ptds                                           = 402,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2_ptds                                           = 403,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2_ptds                                           = 404,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2_ptds                                             = 405,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2_ptds                                    = 406,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2_ptds                                             = 407,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy_ptds                                                  = 408,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer_ptds                                              = 409,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeer_ptds                                            = 410,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2_ptds                                             = 411,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2_ptds                                            = 412,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2_ptds                                            = 413,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2_ptds                                           = 414,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2_ptds                                          = 415,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2_ptds                                          = 416,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject_v2_ptds                                    = 417,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync_ptsz                                             = 418,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2_ptsz                                      = 419,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2_ptsz                                      = 420,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2_ptsz                                      = 421,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2_ptsz                                      = 422,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2_ptsz                                      = 423,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2_ptsz                                        = 424,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2_ptsz                                        = 425,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync_ptsz                                         = 426,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeerAsync_ptsz                                       = 427,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async_ptsz                                           = 428,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async_ptsz                                          = 429,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async_ptsz                                          = 430,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async_ptsz                                         = 431,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async_ptsz                                        = 432,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async_ptsz                                        = 433,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetPriority_ptsz                                       = 434,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetFlags_ptsz                                          = 435,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitEvent_ptsz                                         = 436,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamAddCallback_ptsz                                       = 437,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamAttachMemAsync_ptsz                                    = 438,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamQuery_ptsz                                             = 439,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize_ptsz                                       = 440,
+    CUPTI_DRIVER_TRACE_CBID_cuEventRecord_ptsz                                             = 441,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz                                            = 442,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsMapResources_ptsz                                    = 443,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnmapResources_ptsz                                  = 444,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync_v2_ptsz                               = 445,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerConnect                                     = 446,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerDisconnect                                  = 447,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerPresentFrame                                = 448,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedEglFrame                            = 449,
+    CUPTI_DRIVER_TRACE_CBID_cuPointerGetAttributes                                         = 450,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags           = 451,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialBlockSizeWithFlags                      = 452,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerReturnFrame                                 = 453,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetP2PAttribute                                        = 454,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetBorderColor                                         = 455,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetBorderColor                                         = 456,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAdvise                                                    = 457,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32                                            = 458,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_ptsz                                       = 459,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32                                           = 460,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_ptsz                                      = 461,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp                                             = 462,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_ptsz                                        = 463,
+    CUPTI_DRIVER_TRACE_CBID_cuNVNbufferGetPointer                                          = 464,
+    CUPTI_DRIVER_TRACE_CBID_cuNVNtextureGetArray                                           = 465,
+    CUPTI_DRIVER_TRACE_CBID_cuNNSetAllocator                                               = 466,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync                                             = 467,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync_ptsz                                        = 468,
+    CUPTI_DRIVER_TRACE_CBID_cuEventCreateFromNVNSync                                       = 469,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerConnectWithFlags                            = 470,
+    CUPTI_DRIVER_TRACE_CBID_cuMemRangeGetAttribute                                         = 471,
+    CUPTI_DRIVER_TRACE_CBID_cuMemRangeGetAttributes                                        = 472,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64                                            = 473,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_ptsz                                       = 474,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64                                           = 475,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_ptsz                                      = 476,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel                                      = 477,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel_ptsz                                 = 478,
+    CUPTI_DRIVER_TRACE_CBID_cuEventCreateFromEGLSync                                       = 479,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice                           = 480,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetAttribute                                             = 481,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetUuid                                                = 482,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCtx                                                 = 483,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCtx_ptsz                                            = 484,
+    CUPTI_DRIVER_TRACE_CBID_cuImportExternalMemory                                         = 485,
+    CUPTI_DRIVER_TRACE_CBID_cuExternalMemoryGetMappedBuffer                                = 486,
+    CUPTI_DRIVER_TRACE_CBID_cuExternalMemoryGetMappedMipmappedArray                        = 487,
+    CUPTI_DRIVER_TRACE_CBID_cuDestroyExternalMemory                                        = 488,
+    CUPTI_DRIVER_TRACE_CBID_cuImportExternalSemaphore                                      = 489,
+    CUPTI_DRIVER_TRACE_CBID_cuSignalExternalSemaphoresAsync                                = 490,
+    CUPTI_DRIVER_TRACE_CBID_cuSignalExternalSemaphoresAsync_ptsz                           = 491,
+    CUPTI_DRIVER_TRACE_CBID_cuWaitExternalSemaphoresAsync                                  = 492,
+    CUPTI_DRIVER_TRACE_CBID_cuWaitExternalSemaphoresAsync_ptsz                             = 493,
+    CUPTI_DRIVER_TRACE_CBID_cuDestroyExternalSemaphore                                     = 494,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture                                           = 495,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_ptsz                                      = 496,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamEndCapture                                             = 497,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamEndCapture_ptsz                                        = 498,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamIsCapturing                                            = 499,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamIsCapturing_ptsz                                       = 500,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphCreate                                                  = 501,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddKernelNode                                           = 502,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeGetParams                                     = 503,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemcpyNode                                           = 504,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemcpyNodeGetParams                                     = 505,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemsetNode                                           = 506,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemsetNodeGetParams                                     = 507,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemsetNodeSetParams                                     = 508,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetType                                             = 509,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphGetRootNodes                                            = 510,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependencies                                     = 511,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependentNodes                                   = 512,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiate                                             = 513,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch                                                  = 514,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz                                             = 515,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecDestroy                                             = 516,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphDestroy                                                 = 517,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddDependencies                                         = 518,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphRemoveDependencies                                      = 519,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemcpyNodeSetParams                                     = 520,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeSetParams                                     = 521,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphDestroyNode                                             = 522,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphClone                                                   = 523,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeFindInClone                                         = 524,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddChildGraphNode                                       = 525,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddEmptyNode                                            = 526,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchHostFunc                                               = 527,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchHostFunc_ptsz                                          = 528,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphChildGraphNodeGetGraph                                  = 529,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddHostNode                                             = 530,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphHostNodeGetParams                                       = 531,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetLuid                                                = 532,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphHostNodeSetParams                                       = 533,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphGetNodes                                                = 534,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphGetEdges                                                = 535,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo                                         = 536,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_ptsz                                    = 537,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecKernelNodeSetParams                                 = 538,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_v2                                        = 539,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_v2_ptsz                                   = 540,
+    CUPTI_DRIVER_TRACE_CBID_cuThreadExchangeStreamCaptureMode                              = 541,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetNvSciSyncAttributes                                 = 542,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyAvailableDynamicSMemPerBlock                        = 543,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRelease_v2                                   = 544,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxReset_v2                                     = 545,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxSetFlags_v2                                  = 546,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAddressReserve                                            = 547,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAddressFree                                               = 548,
+    CUPTI_DRIVER_TRACE_CBID_cuMemCreate                                                    = 549,
+    CUPTI_DRIVER_TRACE_CBID_cuMemRelease                                                   = 550,
+    CUPTI_DRIVER_TRACE_CBID_cuMemMap                                                       = 551,
+    CUPTI_DRIVER_TRACE_CBID_cuMemUnmap                                                     = 552,
+    CUPTI_DRIVER_TRACE_CBID_cuMemSetAccess                                                 = 553,
+    CUPTI_DRIVER_TRACE_CBID_cuMemExportToShareableHandle                                   = 554,
+    CUPTI_DRIVER_TRACE_CBID_cuMemImportFromShareableHandle                                 = 555,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAllocationGranularity                                  = 556,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAllocationPropertiesFromHandle                         = 557,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAccess                                                 = 558,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSetFlags                                               = 559,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSetFlags_ptsz                                          = 560,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecUpdate                                              = 561,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecMemcpyNodeSetParams                                 = 562,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecMemsetNodeSetParams                                 = 563,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecHostNodeSetParams                                   = 564,
+    CUPTI_DRIVER_TRACE_CBID_cuMemRetainAllocationHandle                                    = 565,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncGetModule                                                = 566,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcOpenMemHandle_v2                                          = 567,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxResetPersistingL2Cache                                    = 568,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeCopyAttributes                                = 569,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeGetAttribute                                  = 570,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeSetAttribute                                  = 571,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamCopyAttributes                                         = 572,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamCopyAttributes_ptsz                                    = 573,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetAttribute                                           = 574,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetAttribute_ptsz                                      = 575,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSetAttribute                                           = 576,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSetAttribute_ptsz                                      = 577,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiate_v2                                          = 578,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetTexture1DLinearMaxWidth                             = 579,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphUpload                                                  = 580,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphUpload_ptsz                                             = 581,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetSparseProperties                                     = 582,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetSparseProperties                            = 583,
+    CUPTI_DRIVER_TRACE_CBID_cuMemMapArrayAsync                                             = 584,
+    CUPTI_DRIVER_TRACE_CBID_cuMemMapArrayAsync_ptsz                                        = 585,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecChildGraphNodeSetParams                             = 586,
+    CUPTI_DRIVER_TRACE_CBID_cuEventRecordWithFlags                                         = 587,
+    CUPTI_DRIVER_TRACE_CBID_cuEventRecordWithFlags_ptsz                                    = 588,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddEventRecordNode                                      = 589,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddEventWaitNode                                        = 590,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphEventRecordNodeGetEvent                                 = 591,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphEventWaitNodeGetEvent                                   = 592,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphEventRecordNodeSetEvent                                 = 593,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphEventWaitNodeSetEvent                                   = 594,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecEventRecordNodeSetEvent                             = 595,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecEventWaitNodeSetEvent                               = 596,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetPlane                                                = 597,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocAsync                                                = 598,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocAsync_ptsz                                           = 599,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFreeAsync                                                 = 600,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFreeAsync_ptsz                                            = 601,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolTrimTo                                                = 602,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolSetAttribute                                          = 603,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolGetAttribute                                          = 604,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolSetAccess                                             = 605,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetDefaultMemPool                                      = 606,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolCreate                                                = 607,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolDestroy                                               = 608,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceSetMemPool                                             = 609,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetMemPool                                             = 610,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocFromPoolAsync                                        = 611,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocFromPoolAsync_ptsz                                   = 612,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolExportToShareableHandle                               = 613,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolImportFromShareableHandle                             = 614,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolExportPointer                                         = 615,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolImportPointer                                         = 616,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolGetAccess                                             = 617,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddExternalSemaphoresSignalNode                         = 618,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresSignalNodeGetParams                   = 619,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresSignalNodeSetParams                   = 620,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddExternalSemaphoresWaitNode                           = 621,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresWaitNodeGetParams                     = 622,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresWaitNodeSetParams                     = 623,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecExternalSemaphoresSignalNodeSetParams               = 624,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecExternalSemaphoresWaitNodeSetParams                 = 625,
+    CUPTI_DRIVER_TRACE_CBID_cuGetProcAddress                                               = 626,
+    CUPTI_DRIVER_TRACE_CBID_cuFlushGPUDirectRDMAWrites                                     = 627,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphDebugDotPrint                                           = 628,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v2                                      = 629,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v2_ptsz                                 = 630,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies                              = 631,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies_ptsz                         = 632,
+    CUPTI_DRIVER_TRACE_CBID_cuUserObjectCreate                                             = 633,
+    CUPTI_DRIVER_TRACE_CBID_cuUserObjectRetain                                             = 634,
+    CUPTI_DRIVER_TRACE_CBID_cuUserObjectRelease                                            = 635,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphRetainUserObject                                        = 636,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphReleaseUserObject                                       = 637,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemAllocNode                                         = 638,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemFreeNode                                          = 639,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGraphMemTrim                                           = 640,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetGraphMemAttribute                                   = 641,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceSetGraphMemAttribute                                   = 642,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithFlags                                    = 643,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetExecAffinitySupport                                 = 644,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxCreate_v3                                                 = 645,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetExecAffinity                                           = 646,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetUuid_v2                                             = 647,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemAllocNodeGetParams                                   = 648,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemFreeNodeGetParams                                    = 649,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeSetEnabled                                          = 650,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetEnabled                                          = 651,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx                                               = 652,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx_ptsz                                          = 653,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetMemoryRequirements                                   = 654,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetMemoryRequirements                          = 655,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithParams                                   = 656,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithParams_ptsz                              = 657,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecGetFlags                                            = 658,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_v2                                         = 659,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_v2_ptsz                                    = 660,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_v2                                         = 661,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_v2_ptsz                                    = 662,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_v2                                        = 663,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_v2_ptsz                                   = 664,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_v2                                        = 665,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_v2_ptsz                                   = 666,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_v2                                          = 667,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_v2_ptsz                                     = 668,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddBatchMemOpNode                                       = 669,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphBatchMemOpNodeGetParams                                 = 670,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphBatchMemOpNodeSetParams                                 = 671,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecBatchMemOpNodeSetParams                             = 672,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetLoadingMode                                         = 673,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetHandleForAddressRange                                  = 674,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialClusterSize                             = 675,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveClusters                                   = 676,
+    CUPTI_DRIVER_TRACE_CBID_cuGetProcAddress_v2                                            = 677,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryLoadData                                              = 678,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryLoadFromFile                                          = 679,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryUnload                                                = 680,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetKernel                                             = 681,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetModule                                             = 682,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelGetFunction                                            = 683,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetGlobal                                             = 684,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetManaged                                            = 685,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelGetAttribute                                           = 686,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelSetAttribute                                           = 687,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelSetCacheConfig                                         = 688,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddKernelNode_v2                                        = 689,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeGetParams_v2                                  = 690,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeSetParams_v2                                  = 691,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecKernelNodeSetParams_v2                              = 692,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetId                                                  = 693,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetId_ptsz                                             = 694,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetId                                                     = 695,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecUpdate_v2                                           = 696,
+    CUPTI_DRIVER_TRACE_CBID_cuTensorMapEncodeTiled                                         = 697,
+    CUPTI_DRIVER_TRACE_CBID_cuTensorMapEncodeIm2col                                        = 698,
+    CUPTI_DRIVER_TRACE_CBID_cuTensorMapReplaceAddress                                      = 699,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetUnifiedFunction                                    = 700,
+    CUPTI_DRIVER_TRACE_CBID_cuCoredumpGetAttribute                                         = 701,
+    CUPTI_DRIVER_TRACE_CBID_cuCoredumpGetAttributeGlobal                                   = 702,
+    CUPTI_DRIVER_TRACE_CBID_cuCoredumpSetAttribute                                         = 703,
+    CUPTI_DRIVER_TRACE_CBID_cuCoredumpSetAttributeGlobal                                   = 704,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetFlags                                                  = 705,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastCreate                                              = 706,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastAddDevice                                           = 707,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastBindMem                                             = 708,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastBindAddr                                            = 709,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastUnbind                                              = 710,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastGetGranularity                                      = 711,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddNode                                                 = 712,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeSetParams                                           = 713,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecNodeSetParams                                       = 714,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAdvise_v2                                                 = 715,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync_v2                                          = 716,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync_v2_ptsz                                     = 717,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncGetName                                                  = 718,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelGetName                                                = 719,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCaptureToGraph                                    = 720,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCaptureToGraph_ptsz                               = 721,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphConditionalHandleCreate                                 = 722,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddNode_v2                                              = 723,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphGetEdges_v2                                             = 724,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependencies_v2                                  = 725,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependentNodes_v2                                = 726,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddDependencies_v2                                      = 727,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphRemoveDependencies_v2                                   = 728,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v3                                      = 729,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v3_ptsz                                 = 730,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies_v2                           = 731,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies_v2_ptsz                      = 732,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncGetParamInfo                                             = 733,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelGetParamInfo                                           = 734,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceRegisterAsyncNotification                              = 735,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceUnregisterAsyncNotification                            = 736,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetFunctionCount                                       = 737,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleEnumerateFunctions                                     = 738,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetKernelCount                                        = 739,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryEnumerateKernels                                      = 740,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncIsLoaded                                                 = 741,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncLoad                                                     = 742,
+    CUPTI_DRIVER_TRACE_CBID_cuGreenCtxCreate                                               = 743,
+    CUPTI_DRIVER_TRACE_CBID_cuGreenCtxDestroy                                              = 744,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetDevResource                                         = 745,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetDevResource                                            = 746,
+    CUPTI_DRIVER_TRACE_CBID_cuGreenCtxGetDevResource                                       = 747,
+    CUPTI_DRIVER_TRACE_CBID_cuDevResourceGenerateDesc                                      = 748,
+    CUPTI_DRIVER_TRACE_CBID_cuGreenCtxRecordEvent                                          = 749,
+    CUPTI_DRIVER_TRACE_CBID_cuGreenCtxWaitEvent                                            = 750,
+    CUPTI_DRIVER_TRACE_CBID_cuDevSmResourceSplitByCount                                    = 751,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetGreenCtx                                            = 752,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxFromGreenCtx                                              = 753,
+    CUPTI_DRIVER_TRACE_CBID_SIZE                                                           = 754,
+    CUPTI_DRIVER_TRACE_CBID_FORCE_INT                                                      = 0x7fffffff
+} CUpti_driver_api_trace_cbid;
+
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_metrics.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_metrics.h
new file mode 100644
index 0000000000000000000000000000000000000000..28d441e6b51a1be18f22a018800316fda0a779ec
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_metrics.h
@@ -0,0 +1,825 @@
+/*
+ * Copyright 2011-2020   NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_METRIC_H_)
+#define _CUPTI_METRIC_H_
+
+#include <cuda.h>
+#include <string.h>
+#include <cuda_stdint.h>
+#include <cupti_result.h>
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_METRIC_API CUPTI Metric API
+ * Functions, types, and enums that implement the CUPTI Metric API.
+ *
+ * \note The CUPTI metric APIs from the header cupti_metrics.h are not supported on
+ * devices with compute capability 7.5 and higher (i.e. Turing and later GPU
+ * architectures). These APIs will be deprecated in a future CUDA release. They are
+ * replaced by the Profiling API in the header cupti_profiler_target.h and the
+ * Perfworks metrics API in the headers nvperf_host.h and nvperf_target.h, which are
+ * supported on devices with compute capability 7.0 and higher (i.e. Volta and later
+ * GPU architectures).
+ *
+ * @{
+ */
+
+/**
+ * \brief ID for a metric, represented as an unsigned 32-bit integer.
+ *
+ * A metric provides a measure of some aspect of the device.
+ */
+typedef uint32_t CUpti_MetricID;
+
+/**
+ * \brief A metric category.
+ *
+ * Each metric is assigned to a category that represents the general
+ * type of the metric. A metric's category is accessed using \ref
+ * cuptiMetricGetAttribute and the CUPTI_METRIC_ATTR_CATEGORY
+ * attribute.
+ */
+typedef enum {
+  /**
+   * A memory related metric.
+   */
+  CUPTI_METRIC_CATEGORY_MEMORY          = 0,
+  /**
+   * An instruction related metric.
+   */
+  CUPTI_METRIC_CATEGORY_INSTRUCTION     = 1,
+  /**
+   * A multiprocessor related metric.
+   */
+  CUPTI_METRIC_CATEGORY_MULTIPROCESSOR  = 2,
+  /**
+   * A cache related metric.
+   */
+  CUPTI_METRIC_CATEGORY_CACHE           = 3,
+  /**
+   * A texture related metric.
+   */
+  CUPTI_METRIC_CATEGORY_TEXTURE         = 4,
+  /**
+   * An NVLink related metric.
+   */
+  CUPTI_METRIC_CATEGORY_NVLINK          = 5,
+  /**
+   * A PCIe related metric.
+   */
+  CUPTI_METRIC_CATEGORY_PCIE           = 6,
+  CUPTI_METRIC_CATEGORY_FORCE_INT                         = 0x7fffffff,
+} CUpti_MetricCategory;
+
+/**
+ * \brief A metric evaluation mode.
+ *
+ * A metric can be evaluated per hardware instance to know the load balancing
+ * across instances of a domain or the metric can be evaluated in aggregate mode
+ * when the events involved in metric evaluation are from different event
+ * domains. It might be possible to evaluate some metrics in both
+ * modes for convenience. A metric's evaluation mode is accessed using \ref
+ * cuptiMetricGetAttribute and the CUPTI_METRIC_ATTR_EVALUATION_MODE
+ * attribute.
+ */
+typedef enum {
+  /**
+   * If this bit is set, the metric can be profiled for each instance of the
+   * domain. The event values passed to \ref cuptiMetricGetValue can contain
+   * values for one instance of the domain. And \ref cuptiMetricGetValue can
+   * be called for each instance.
+   */
+  CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE         = 1,
+  /**
+   * If this bit is set, the metric can be profiled over all instances. The
+   * event values passed to \ref cuptiMetricGetValue can be aggregated values
+   * of events for all instances of the domain.
+   */
+  CUPTI_METRIC_EVALUATION_MODE_AGGREGATE            = 1 << 1,
+  CUPTI_METRIC_EVALUATION_MODE_FORCE_INT            = 0x7fffffff,
+} CUpti_MetricEvaluationMode;
+
+/**
+ * \brief Kinds of metric values.
+ *
+ * Metric values can be one of several different kinds. Corresponding
+ * to each kind is a member of the CUpti_MetricValue union. The metric
+ * value returned by \ref cuptiMetricGetValue should be accessed using
+ * the appropriate member of that union based on its value kind.
+ */
+typedef enum {
+  /**
+   * The metric value is a 64-bit double.
+   */
+  CUPTI_METRIC_VALUE_KIND_DOUBLE            = 0,
+  /**
+   * The metric value is a 64-bit unsigned integer.
+   */
+  CUPTI_METRIC_VALUE_KIND_UINT64            = 1,
+  /**
+   * The metric value is a percentage represented by a 64-bit
+   * double. For example, 57.5% is represented by the value 57.5.
+   */
+  CUPTI_METRIC_VALUE_KIND_PERCENT           = 2,
+  /**
+   * The metric value is a throughput represented by a 64-bit
+   * unsigned integer. The unit for throughput values is bytes/second.
+   */
+  CUPTI_METRIC_VALUE_KIND_THROUGHPUT        = 3,
+  /**
+   * The metric value is a 64-bit signed integer.
+   */
+  CUPTI_METRIC_VALUE_KIND_INT64             = 4,
+  /**
+   * The metric value is a utilization level, as represented by
+   * CUpti_MetricValueUtilizationLevel.
+   */
+  CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL = 5,
+
+  CUPTI_METRIC_VALUE_KIND_FORCE_INT  = 0x7fffffff
+} CUpti_MetricValueKind;
+
+/**
+ * \brief Utilization levels for metric values of kind
+ * CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL. Utilization values can
+ * vary from IDLE (0) to MAX (10), but the enumeration only provides
+ * specific names for a few values.
+ */
+typedef enum {
+  CUPTI_METRIC_VALUE_UTILIZATION_IDLE      = 0,
+  CUPTI_METRIC_VALUE_UTILIZATION_LOW       = 2,
+  CUPTI_METRIC_VALUE_UTILIZATION_MID       = 5,
+  CUPTI_METRIC_VALUE_UTILIZATION_HIGH      = 8,
+  CUPTI_METRIC_VALUE_UTILIZATION_MAX       = 10,
+  CUPTI_METRIC_VALUE_UTILIZATION_FORCE_INT = 0x7fffffff
+} CUpti_MetricValueUtilizationLevel;
+
+/**
+ * \brief Metric attributes.
+ *
+ * Metric attributes describe properties of a metric. These attributes
+ * can be read using \ref cuptiMetricGetAttribute.
+ */
+typedef enum {
+  /**
+   * Metric name. Value is a null-terminated const C string.
+   */
+  CUPTI_METRIC_ATTR_NAME              = 0,
+  /**
+   * Short description of metric. Value is a null-terminated const C string.
+   */
+  CUPTI_METRIC_ATTR_SHORT_DESCRIPTION = 1,
+  /**
+   * Long description of metric. Value is a null-terminated const C string.
+   */
+  CUPTI_METRIC_ATTR_LONG_DESCRIPTION  = 2,
+  /**
+   * Category of the metric. Value is of type CUpti_MetricCategory.
+   */
+  CUPTI_METRIC_ATTR_CATEGORY          = 3,
+  /**
+   * Value type of the metric. Value is of type CUpti_MetricValueKind.
+   */
+  CUPTI_METRIC_ATTR_VALUE_KIND          = 4,
+  /**
+   * Metric evaluation mode. Value is of type CUpti_MetricEvaluationMode.
+   */
+  CUPTI_METRIC_ATTR_EVALUATION_MODE     = 5,
+  CUPTI_METRIC_ATTR_FORCE_INT         = 0x7fffffff,
+} CUpti_MetricAttribute;
+
+/**
+ * \brief A metric value.
+ *
+ * Metric values can be one of several different kinds. Corresponding
+ * to each kind is a member of the CUpti_MetricValue union. The metric
+ * value returned by \ref cuptiMetricGetValue should be accessed using
+ * the appropriate member of that union based on its value kind.
+ */
+typedef union {
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_DOUBLE.
+   */
+  double metricValueDouble;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_UINT64.
+   */
+  uint64_t metricValueUint64;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_INT64.
+   */
+  int64_t metricValueInt64;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_PERCENT. For example, 57.5% is
+   * represented by the value 57.5.
+   */
+  double metricValuePercent;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_THROUGHPUT.  The unit for
+   * throughput values is bytes/second.
+   */
+  uint64_t metricValueThroughput;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL.
+   */
+  CUpti_MetricValueUtilizationLevel metricValueUtilizationLevel;
+} CUpti_MetricValue;
+
+/**
+ * \brief Device class.
+ *
+ * Enumeration of device classes for metric property
+ * CUPTI_METRIC_PROPERTY_DEVICE_CLASS.
+ */
+typedef enum {
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_TESLA          = 0,
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_QUADRO         = 1,
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_GEFORCE        = 2,
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_TEGRA          = 3,
+} CUpti_MetricPropertyDeviceClass;
+
+/**
+ * \brief Metric device properties.
+ *
+ * Metric device properties describe device properties which are needed for a metric.
+ * Some of these properties can be collected using cuDeviceGetAttribute.
+ */
+typedef enum {
+  /**
+   * Number of multiprocessors on a device.  This can be collected
+   * using the value of CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT of
+   * cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_MULTIPROCESSOR_COUNT,
+  /**
+   * Maximum number of warps on a multiprocessor. This can be
+   * collected using the ratio of the values of
+   * CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR and
+   * CU_DEVICE_ATTRIBUTE_WARP_SIZE of cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_WARPS_PER_MULTIPROCESSOR,
+  /**
+   * GPU Time for kernel in ns. This should be profiled using CUPTI
+   * Activity API.
+   */
+  CUPTI_METRIC_PROPERTY_KERNEL_GPU_TIME,
+  /**
+   * Clock rate for device in KHz.  This should be collected using
+   * the value of CU_DEVICE_ATTRIBUTE_CLOCK_RATE of
+   * cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_CLOCK_RATE,
+  /**
+   * Number of Frame buffer units for device. This should be collected
+   * using the value of CUPTI_DEVICE_ATTR_MAX_FRAME_BUFFERS of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FRAME_BUFFER_COUNT,
+  /**
+   * Global memory bandwidth in KBytes/sec. This should be collected
+   * using the value of CUPTI_DEVICE_ATTR_GLOBAL_MEMORY_BANDWIDTH
+   * of cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_GLOBAL_MEMORY_BANDWIDTH,
+  /**
+   * PCIE link rate in Mega bits/sec. This should be collected using
+   * the value of CUPTI_DEVICE_ATTR_PCIE_LINK_RATE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_PCIE_LINK_RATE,
+  /**
+   * PCIE link width for device. This should be collected using
+   * the value of CUPTI_DEVICE_ATTR_PCIE_LINK_WIDTH of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_PCIE_LINK_WIDTH,
+  /**
+   * PCIE generation for device. This should be collected using
+   * the value of CUPTI_DEVICE_ATTR_PCIE_GEN of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_PCIE_GEN,
+  /**
+   * The device class. This should be collected using
+   * the value of CUPTI_DEVICE_ATTR_DEVICE_CLASS of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS,
+  /**
+   * Peak single precision floating point operations that
+   * can be performed in one cycle by the device.
+   * This should be collected using the value of
+   * CUPTI_DEVICE_ATTR_FLOP_SP_PER_CYCLE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FLOP_SP_PER_CYCLE,
+  /**
+   * Peak double precision floating point operations that
+   * can be performed in one cycle by the device.
+   * This should be collected using the value of
+   * CUPTI_DEVICE_ATTR_FLOP_DP_PER_CYCLE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FLOP_DP_PER_CYCLE,
+  /**
+   * Number of L2 units on a device. This can be collected
+   * using the value of CUPTI_DEVICE_ATTR_MAX_L2_UNITS of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_L2_UNITS,
+  /**
+   * Whether ECC support is enabled on the device. This can be
+   * collected using the value of CU_DEVICE_ATTRIBUTE_ECC_ENABLED of
+   * cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_ECC_ENABLED,
+  /**
+   * Peak half precision floating point operations that
+   * can be performed in one cycle by the device.
+   * This should be collected using the value of
+   * CUPTI_DEVICE_ATTR_FLOP_HP_PER_CYCLE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FLOP_HP_PER_CYCLE,
+  /**
+   * NVLINK bandwidth for device. This should be collected
+   * using the value of CUPTI_DEVICE_ATTR_GPU_CPU_NVLINK_BW of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_GPU_CPU_NVLINK_BANDWIDTH,
+} CUpti_MetricPropertyID;
+
+/**
+ * \brief Get the total number of metrics available on any device.
+ *
+ * Returns the total number of metrics available on any CUDA-capable
+ * device.
+ *
+ * \param numMetrics Returns the number of metrics
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numMetrics is NULL
+*/
+CUptiResult CUPTIAPI cuptiGetNumMetrics(uint32_t *numMetrics);
+
+/**
+ * \brief Get all the metrics available on any device.
+ *
+ * Returns the metric IDs in \p metricArray for all CUDA-capable
+ * devices.  The size of the \p metricArray buffer is given by \p
+ * *arraySizeBytes. The size of the \p metricArray buffer must be at
+ * least \p numMetrics * sizeof(CUpti_MetricID) or not all metric IDs
+ * will be returned. The value returned in \p *arraySizeBytes contains
+ * the number of bytes returned in \p metricArray.
+ *
+ * \param arraySizeBytes The size of \p metricArray in bytes, and
+ * returns the number of bytes written to \p metricArray
+ * \param metricArray Returns the IDs of the metrics
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p metricArray are NULL
+*/
+CUptiResult CUPTIAPI cuptiEnumMetrics(size_t *arraySizeBytes,
+                                      CUpti_MetricID *metricArray);
+
+/**
+ * \brief Get the number of metrics for a device.
+ *
+ * Returns the number of metrics available for a device.
+ *
+ * \param device The CUDA device
+ * \param numMetrics Returns the number of metrics available for the
+ * device
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numMetrics is NULL
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetNumMetrics(CUdevice device,
+                                              uint32_t *numMetrics);
+
+/**
+ * \brief Get the metrics for a device.
+ *
+ * Returns the metric IDs in \p metricArray for a device.  The size of
+ * the \p metricArray buffer is given by \p *arraySizeBytes. The size
+ * of the \p metricArray buffer must be at least \p numMetrics *
+ * sizeof(CUpti_MetricID) or else not all metric IDs will be
+ * returned. The value returned in \p *arraySizeBytes contains the
+ * number of bytes returned in \p metricArray.
+ *
+ * \param device The CUDA device
+ * \param arraySizeBytes The size of \p metricArray in bytes, and
+ * returns the number of bytes written to \p metricArray
+ * \param metricArray Returns the IDs of the metrics for the device
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p metricArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiDeviceEnumMetrics(CUdevice device,
+                                            size_t *arraySizeBytes,
+                                            CUpti_MetricID *metricArray);
+
+/**
+ * \brief Get a metric attribute.
+ *
+ * Returns a metric attribute in \p *value. The size of the \p
+ * value buffer is given by \p *valueSize. The value returned in \p
+ * *valueSize contains the number of bytes returned in \p value.
+ *
+ * If the attribute value is a c-string that is longer than \p
+ * *valueSize, then only the first \p *valueSize characters will be
+ * returned and there will be no terminating null byte.
+ *
+ * \param metric ID of the metric
+ * \param attrib The metric attribute to read
+ * \param valueSize The size of the \p value buffer in bytes, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the attribute's value
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not a metric attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiMetricGetAttribute(CUpti_MetricID metric,
+                                             CUpti_MetricAttribute attrib,
+                                             size_t *valueSize,
+                                             void *value);
+
+/**
+ * \brief Find a metric by name.
+ *
+ * Find a metric by name and return the metric ID in \p *metric.
+ *
+ * \param device The CUDA device
+ * \param metricName The name of the metric to find
+ * \param metric Returns the ID of the found metric or undefined if
+ * unable to find the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_METRIC_NAME if unable to find a metric
+ * with name \p metricName. In this case \p *metric is undefined
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricName or \p
+ * metric are NULL.
+ */
+CUptiResult CUPTIAPI cuptiMetricGetIdFromName(CUdevice device,
+                                              const char *metricName,
+                                              CUpti_MetricID *metric);
+
+/**
+ * \brief Get number of events required to calculate a metric.
+ *
+ * Returns the number of events in \p numEvents that are required to
+ * calculate a metric.
+ *
+ * \param metric ID of the metric
+ * \param numEvents Returns the number of events required for the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numEvents is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetNumEvents(CUpti_MetricID metric,
+                                             uint32_t *numEvents);
+
+/**
+ * \brief Get the events required to calculate a metric.
+ *
+ * Gets the event IDs in \p eventIdArray required to calculate a \p
+ * metric. The size of the \p eventIdArray buffer is given by \p
+ * *eventIdArraySizeBytes and must be at least \p numEvents *
+ * sizeof(CUpti_EventID) or not all events will be returned. The value
+ * returned in \p *eventIdArraySizeBytes contains the number of bytes
+ * returned in \p eventIdArray.
+ *
+ * \param metric ID of the metric
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes,
+ * and returns the number of bytes written to \p eventIdArray
+ * \param eventIdArray Returns the IDs of the events required to
+ * calculate \p metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventIdArraySizeBytes or \p
+ * eventIdArray are NULL.
+ */
+CUptiResult CUPTIAPI cuptiMetricEnumEvents(CUpti_MetricID metric,
+                                           size_t *eventIdArraySizeBytes,
+                                           CUpti_EventID *eventIdArray);
+
+/**
+ * \brief Get number of properties required to calculate a metric.
+ *
+ * Returns the number of properties in \p numProp that are required to
+ * calculate a metric.
+ *
+ * \param metric ID of the metric
+ * \param numProp Returns the number of properties required for the
+ * metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numProp is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetNumProperties(CUpti_MetricID metric,
+                                                 uint32_t *numProp);
+
+/**
+ * \brief Get the properties required to calculate a metric.
+ *
+ * Gets the property IDs in \p propIdArray required to calculate a \p
+ * metric. The size of the \p propIdArray buffer is given by \p
+ * *propIdArraySizeBytes and must be at least \p numProp *
+ * sizeof(CUpti_MetricPropertyID) or not all properties will be
+ * returned. The value returned in \p *propIdArraySizeBytes contains
+ * the number of bytes returned in \p propIdArray.
+ *
+ * \param metric ID of the metric
+ * \param propIdArraySizeBytes The size of \p propIdArray in bytes,
+ * and returns the number of bytes written to \p propIdArray
+ * \param propIdArray Returns the IDs of the properties required to
+ * calculate \p metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p propIdArraySizeBytes or \p
+ * propIdArray are NULL.
+ */
+CUptiResult CUPTIAPI cuptiMetricEnumProperties(CUpti_MetricID metric,
+                                               size_t *propIdArraySizeBytes,
+                                               CUpti_MetricPropertyID *propIdArray);
+
+
+/**
+ * \brief For a metric get the groups of events that must be collected
+ * in the same pass.
+ *
+ * For a metric get the groups of events that must be collected in the
+ * same pass to ensure that the metric is calculated correctly. If the
+ * events are not collected as specified then the metric value may be
+ * inaccurate.
+ *
+ * The function returns NULL if a metric does not have any required
+ * event group. In this case the events needed for the metric can be
+ * grouped in any manner for collection.
+ *
+ * \param context The context for event collection
+ * \param metric The metric ID
+ * \param eventGroupSets Returns a CUpti_EventGroupSets object that
+ * indicates the events that must be collected in the same pass to
+ * ensure the metric is calculated correctly.  Returns NULL if no
+ * grouping is required for metric
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ */
+CUptiResult CUPTIAPI cuptiMetricGetRequiredEventGroupSets(CUcontext context,
+                                                          CUpti_MetricID metric,
+                                                          CUpti_EventGroupSets **eventGroupSets);
+
+/**
+ * \brief For a set of metrics, get the grouping that indicates the
+ * number of passes and the event groups necessary to collect the
+ * events required for those metrics.
+ *
+ * For a set of metrics, get the grouping that indicates the number of
+ * passes and the event groups necessary to collect the events
+ * required for those metrics.
+ *
+ * \see cuptiEventGroupSetsCreate for details on event group set
+ * creation.
+ *
+ * \param context The context for event collection
+ * \param metricIdArraySizeBytes Size of the metricIdArray in bytes
+ * \param metricIdArray Array of metric IDs
+ * \param eventGroupPasses Returns a CUpti_EventGroupSets object that
+ * indicates the number of passes required to collect the events and
+ * the events to collect on each pass
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricIdArray or
+ * \p eventGroupPasses is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricCreateEventGroupSets(CUcontext context,
+                                                     size_t metricIdArraySizeBytes,
+                                                     CUpti_MetricID *metricIdArray,
+                                                     CUpti_EventGroupSets **eventGroupPasses);
+
+/**
+ * \brief Calculate the value for a metric.
+ *
+ * Use the events collected for a metric to calculate the metric
+ * value. Metric value evaluation depends on the evaluation mode
+ * \ref CUpti_MetricEvaluationMode that the metric supports.
+ * If a metric has evaluation mode as CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE,
+ * then it assumes that the input event value is for one domain instance.
+ * If a metric has evaluation mode as CUPTI_METRIC_EVALUATION_MODE_AGGREGATE,
+ * it assumes that input event values are
+ * normalized to represent all domain instances on a device. For the
+ * most accurate metric collection, the events required for the metric
+ * should be collected for all profiled domain instances. For example,
+ * to collect all instances of an event, set the
+ * CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES attribute on
+ * the group containing the event to 1. The normalized value for the
+ * event is then: (\p sum_event_values * \p totalInstanceCount) / \p
+ * instanceCount, where \p sum_event_values is the summation of the
+ * event values across all profiled domain instances, \p
+ * totalInstanceCount is obtained from querying
+ * CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT and \p instanceCount
+ * is obtained from querying CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT (or
+ * CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT).
+ *
+ * \param device The CUDA device that the metric is being calculated for
+ * \param metric The metric ID
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes
+ * \param eventIdArray The event IDs required to calculate \p metric
+ * \param eventValueArraySizeBytes The size of \p eventValueArray in bytes
+ * \param eventValueArray The normalized event values required to
+ * calculate \p metric. The values must be ordered to match the order of
+ * events in \p eventIdArray
+ * \param timeDuration The duration over which the events were
+ * collected, in ns
+ * \param metricValue Returns the value for the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_OPERATION
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if the
+ * eventIdArray does not contain all the events needed for metric
+ * \retval CUPTI_ERROR_INVALID_EVENT_VALUE if any of the
+ * event values required for the metric is CUPTI_EVENT_OVERFLOW
+ * \retval CUPTI_ERROR_INVALID_METRIC_VALUE if the computed metric value
+ * cannot be represented in the metric's value type. For example,
+ * if the metric value type is unsigned and the computed metric value is negative
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricValue,
+ * \p eventIdArray or \p eventValueArray is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetValue(CUdevice device,
+                                         CUpti_MetricID metric,
+                                         size_t eventIdArraySizeBytes,
+                                         CUpti_EventID *eventIdArray,
+                                         size_t eventValueArraySizeBytes,
+                                         uint64_t *eventValueArray,
+                                         uint64_t timeDuration,
+                                         CUpti_MetricValue *metricValue);
+
+/**
+ * \brief Calculate the value for a metric.
+ *
+ * Use the events and properties collected for a metric to calculate
+ * the metric value. Metric value evaluation depends on the evaluation
+ * mode \ref CUpti_MetricEvaluationMode that the metric supports.  If
+ * a metric has evaluation mode as
+ * CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE, then it assumes that the
+ * input event value is for one domain instance.  If a metric has
+ * evaluation mode as CUPTI_METRIC_EVALUATION_MODE_AGGREGATE, it
+ * assumes that input event values are normalized to represent all
+ * domain instances on a device. For the most accurate metric
+ * collection, the events required for the metric should be collected
+ * for all profiled domain instances. For example, to collect all
+ * instances of an event, set the
+ * CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES attribute on
+ * the group containing the event to 1. The normalized value for the
+ * event is then: (\p sum_event_values * \p totalInstanceCount) / \p
+ * instanceCount, where \p sum_event_values is the summation of the
+ * event values across all profiled domain instances, \p
+ * totalInstanceCount is obtained from querying
+ * CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT and \p instanceCount
+ * is obtained from querying CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT (or
+ * CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT).
+ *
+ * \param metric The metric ID
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes
+ * \param eventIdArray The event IDs required to calculate \p metric
+ * \param eventValueArraySizeBytes The size of \p eventValueArray in bytes
+ * \param eventValueArray The normalized event values required to
+ * calculate \p metric. The values must be ordered to match the order of
+ * events in \p eventIdArray
+ * \param propIdArraySizeBytes The size of \p propIdArray in bytes
+ * \param propIdArray The metric property IDs required to calculate \p metric
+ * \param propValueArraySizeBytes The size of \p propValueArray in bytes
+ * \param propValueArray The metric property values required to
+ * calculate \p metric. The values must be ordered to match the order of
+ * metric properties in \p propIdArray
+ * \param metricValue Returns the value for the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_OPERATION
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if the
+ * eventIdArray does not contain all the events needed for metric
+ * \retval CUPTI_ERROR_INVALID_EVENT_VALUE if any of the
+ * event values required for the metric is CUPTI_EVENT_OVERFLOW
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the computed metric value
+ * cannot be represented in the metric's value type. For example,
+ * if the metric value type is unsigned and the computed metric value is negative
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricValue,
+ * \p eventIdArray or \p eventValueArray is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetValue2(CUpti_MetricID metric,
+                                          size_t eventIdArraySizeBytes,
+                                          CUpti_EventID *eventIdArray,
+                                          size_t eventValueArraySizeBytes,
+                                          uint64_t *eventValueArray,
+                                          size_t propIdArraySizeBytes,
+                                          CUpti_MetricPropertyID *propIdArray,
+                                          size_t propValueArraySizeBytes,
+                                          uint64_t *propValueArray,
+                                          CUpti_MetricValue *metricValue);
+
+/** @} */ /* END CUPTI_METRIC_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_METRIC_H_*/
+
+
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_profiler_target.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_profiler_target.h
new file mode 100644
index 0000000000000000000000000000000000000000..af41b55e2226539d69e0631ebb78185399e8b936
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_profiler_target.h
@@ -0,0 +1,601 @@
+/*
+ * Copyright 2011-2023   NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_PROFILER_TARGET_H_)
+#define _CUPTI_PROFILER_TARGET_H_
+
+#include <cuda.h>
+#include <cupti_result.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_PROFILER_API CUPTI Profiling API
+ * Functions, types, and enums that implement the CUPTI Profiling API.
+ * @{
+ */
+#ifndef CUPTI_PROFILER_STRUCT_SIZE
+#define CUPTI_PROFILER_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif
+
+/**
+ * \brief Profiler range attribute
+ *
+ * A metric enabled in the session's configuration is collected separately per unique range-stack in the pass.
+ * This is an attribute to collect metrics around each kernel in a profiling session or in a user-defined range.
+ */
+typedef enum
+{
+    /**
+     * Invalid value
+     */
+    CUPTI_Range_INVALID,
+    /**
+     * Ranges are auto defined around each kernel in a profiling session
+     */
+    CUPTI_AutoRange,
+    /**
+     * A range in which metric data to be collected is defined by the user
+     */
+    CUPTI_UserRange,
+    /**
+     * Range count
+     */
+    CUPTI_Range_COUNT,
+} CUpti_ProfilerRange;
+
+/**
+ * \brief Profiler replay attribute
+ *
+ * For metrics which require multipass collection, a replay of the GPU kernel(s) is required.
+ * This attribute specifies how the replay of the kernel(s) to be measured is done.
+ */
+typedef enum
+{
+    /**
+     * Invalid Value
+     */
+    CUPTI_Replay_INVALID,
+    /**
+     * Replay is done by CUPTI user around the process
+     */
+    CUPTI_ApplicationReplay,
+    /**
+     * Replay is done around kernel implicitly by CUPTI
+     */
+    CUPTI_KernelReplay,
+    /**
+     * Replay is done by CUPTI user within a process
+     */
+    CUPTI_UserReplay,
+    /**
+     * Replay count
+     */
+    CUPTI_Replay_COUNT,
+} CUpti_ProfilerReplayMode;
+
+/**
+ * \brief Default parameter for cuptiProfilerInitialize
+ */
+typedef struct CUpti_Profiler_Initialize_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_Initialize_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+} CUpti_Profiler_Initialize_Params;
+#define CUpti_Profiler_Initialize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Initialize_Params, pPriv)
+
+/**
+ * \brief Default parameter for cuptiProfilerDeInitialize
+ */
+typedef struct CUpti_Profiler_DeInitialize_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+} CUpti_Profiler_DeInitialize_Params;
+#define CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_DeInitialize_Params, pPriv)
+
+/**
+ * \brief Initializes the profiler interface
+ *
+ * Loads the required libraries in the process address space.
+ * Sets up the hooks with the CUDA driver.
+ */
+CUptiResult CUPTIAPI cuptiProfilerInitialize(CUpti_Profiler_Initialize_Params *pParams);
+
+/**
+ * \brief DeInitializes the profiler interface
+ */
+CUptiResult CUPTIAPI cuptiProfilerDeInitialize(CUpti_Profiler_DeInitialize_Params *pParams);
+
+/**
+ * \brief Input parameter to define the counterDataImage
+ */
+typedef struct CUpti_Profiler_CounterDataImageOptions
+{
+    size_t structSize;                                          //!< [in] CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE
+    void* pPriv;                                                //!< [in] assign to NULL
+
+    const uint8_t* pCounterDataPrefix;                          /**< [in] Address of CounterDataPrefix generated from NVPW_CounterDataBuilder_GetCounterDataPrefix().
+                                                                    Must be align(8).*/
+    size_t counterDataPrefixSize;                               //!< [in] Size of CounterDataPrefix generated from NVPW_CounterDataBuilder_GetCounterDataPrefix().
+    uint32_t maxNumRanges;                                      //!< [in] Maximum number of ranges that can be profiled
+    uint32_t maxNumRangeTreeNodes;                              //!< [in] Maximum number of RangeTree nodes; must be >= maxNumRanges
+    uint32_t maxRangeNameLength;                                //!< [in] Maximum string length of each RangeName, including the trailing NULL character
+} CUpti_Profiler_CounterDataImageOptions;
+#define CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE                       CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImageOptions, maxRangeNameLength)
+
+/**
+ * \brief Params for cuptiProfilerCounterDataImageCalculateSize
+ */
+typedef struct CUpti_Profiler_CounterDataImage_CalculateSize_Params
+{
+    size_t structSize;                                          //!< [in] CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE
+    void* pPriv;                                                //!< [in] assign to NULL
+
+    size_t sizeofCounterDataImageOptions;                       //!< [in] CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE
+    const CUpti_Profiler_CounterDataImageOptions* pOptions;     //!< [in] Pointer to Counter Data Image Options
+    size_t counterDataImageSize;                                //!< [out] Calculated counterDataImage size; pass it to cuptiProfilerCounterDataImageInitialize
+} CUpti_Profiler_CounterDataImage_CalculateSize_Params;
+#define CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE         CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImage_CalculateSize_Params, counterDataImageSize)
+
+/**
+ * \brief Params for cuptiProfilerCounterDataImageInitialize
+ */
+typedef struct CUpti_Profiler_CounterDataImage_Initialize_Params
+{
+    size_t structSize;                                          //!< [in] CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE
+    void* pPriv;                                                //!< [in] assign to NULL
+
+    size_t sizeofCounterDataImageOptions;                       //!< [in] CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE
+    const CUpti_Profiler_CounterDataImageOptions* pOptions;     //!< [in] Pointer to Counter Data Image Options
+    size_t counterDataImageSize;                                //!< [in] Size calculated from cuptiProfilerCounterDataImageCalculateSize
+    uint8_t* pCounterDataImage;                                 //!< [in] The buffer to be initialized.
+} CUpti_Profiler_CounterDataImage_Initialize_Params;
+#define CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE            CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImage_Initialize_Params, pCounterDataImage)
+
+/**
+ * \brief A CounterData image allocates space for values for each counter for each range.
+ *
+ * The user bears the responsibility of managing the counterDataImage allocations.
+ * CounterDataPrefix contains meta data about the metrics that will be stored in counterDataImage.
+ * Use these APIs to calculate the allocation size and initialize counterData image.
+ */
+CUptiResult cuptiProfilerCounterDataImageCalculateSize(CUpti_Profiler_CounterDataImage_CalculateSize_Params* pParams);
+CUptiResult cuptiProfilerCounterDataImageInitialize(CUpti_Profiler_CounterDataImage_Initialize_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerCounterDataImageCalculateScratchBufferSize
+ */
+typedef struct CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    size_t counterDataImageSize;                            //!< [in] size calculated from cuptiProfilerCounterDataImageCalculateSize
+    uint8_t* pCounterDataImage;                             //!< [in] address of CounterDataImage
+    size_t counterDataScratchBufferSize;                    //!< [out] calculated scratch buffer size; pass it to cuptiProfilerCounterDataImageInitializeScratchBuffer
+} CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params;
+#define CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE    CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params, counterDataScratchBufferSize)
+
+/**
+ * \brief Params for cuptiProfilerCounterDataImageInitializeScratchBuffer
+ */
+typedef struct CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    size_t counterDataImageSize;                            //!< [in] size calculated from cuptiProfilerCounterDataImageCalculateSize
+    uint8_t* pCounterDataImage;                             //!< [in] address of CounterDataImage
+    size_t counterDataScratchBufferSize;                    //!< [in] size calculated using cuptiProfilerCounterDataImageCalculateScratchBufferSize
+    uint8_t* pCounterDataScratchBuffer;                     //!< [in] the scratch buffer to be initialized.
+} CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params;
+#define CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE       CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params, pCounterDataScratchBuffer)
+
+/**
+ * \brief A temporary storage for CounterData image needed for internal operations
+ *
+ * Use these APIs to calculate the allocation size and initialize counterData image scratch buffer.
+ */
+CUptiResult cuptiProfilerCounterDataImageCalculateScratchBufferSize(CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params* pParams);
+CUptiResult cuptiProfilerCounterDataImageInitializeScratchBuffer(CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerBeginSession
+ */
+typedef struct CUpti_Profiler_BeginSession_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_BeginSession_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    size_t counterDataImageSize;                            //!< [in] size calculated from cuptiProfilerCounterDataImageCalculateSize
+    uint8_t* pCounterDataImage;                             //!< [in] address of CounterDataImage
+    size_t counterDataScratchBufferSize;                    //!< [in] size calculated from cuptiProfilerCounterDataImageCalculateScratchBufferSize
+    uint8_t* pCounterDataScratchBuffer;                     //!< [in] address of CounterDataImage scratch buffer
+    uint8_t bDumpCounterDataInFile;                          //!< [in] [optional]
+    const char* pCounterDataFilePath;                        //!< [in] [optional]
+    CUpti_ProfilerRange range;                               //!< [in] CUpti_ProfilerRange
+    CUpti_ProfilerReplayMode replayMode;                     //!< [in] CUpti_ProfilerReplayMode
+    /* Replay options, required when replay is done by cupti user */
+    size_t maxRangesPerPass;                                //!< [in] Maximum number of ranges that can be recorded in a single pass.
+    size_t maxLaunchesPerPass;                              //!< [in] Maximum number of kernel launches that can be recorded in a single pass; must be >= maxRangesPerPass.
+
+} CUpti_Profiler_BeginSession_Params;
+#define CUpti_Profiler_BeginSession_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_BeginSession_Params, maxLaunchesPerPass)
+/**
+ * \brief Params for cuptiProfilerEndSession
+ */
+typedef struct CUpti_Profiler_EndSession_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_EndSession_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_EndSession_Params;
+#define CUpti_Profiler_EndSession_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_EndSession_Params, ctx)
+
+/**
+ * \brief Begin profiling session sets up the profiling on the device
+ *
+ * This does not start profiling itself, but the GPU resources needed for profiling are allocated.
+ * Outside of a session, the GPU will return to its normal operating state.
+ */
+CUptiResult CUPTIAPI cuptiProfilerBeginSession(CUpti_Profiler_BeginSession_Params* pParams);
+/**
+ * \brief Ends profiling session
+ *
+ * Frees up the GPU resources acquired for profiling.
+ * Outside of a session, the GPU will return to its normal operating state.
+ */
+CUptiResult CUPTIAPI cuptiProfilerEndSession(CUpti_Profiler_EndSession_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerSetConfig
+ */
+typedef struct CUpti_Profiler_SetConfig_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_SetConfig_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    const uint8_t* pConfig;                                 //!< [in] Config created by NVPW_RawMetricsConfig_GetConfigImage(). Must be align(8).
+    size_t configSize;                                      //!< [in] size of config
+    uint16_t minNestingLevel;                               //!< [in] the lowest nesting level to be profiled; must be >= 1
+    uint16_t numNestingLevels;                              //!< [in] the number of nesting levels to profile; must be >= 1
+    size_t passIndex;                                       //!< [in] Set this to zero for in-app replay; set this to the output of EndPass() for application replay
+    uint16_t targetNestingLevel;                            //!< [in] Set this to minNestingLevel for in-app replay; set this to the output of EndPass() for application replay
+} CUpti_Profiler_SetConfig_Params;
+
+#define CUpti_Profiler_SetConfig_Params_STRUCT_SIZE                    CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_SetConfig_Params, targetNestingLevel)
+
+/**
+ * \brief Params for cuptiProfilerUnsetConfig
+ */
+typedef struct CUpti_Profiler_UnsetConfig_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_UnsetConfig_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_UnsetConfig_Params;
+#define CUpti_Profiler_UnsetConfig_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_UnsetConfig_Params, ctx)
+
+/**
+ * \brief Set metrics configuration to be profiled
+ *
+ * Use these APIs to set the config to profile in a session. It can be used for advanced cases such as where multiple
+ * configurations are collected into a single CounterData Image on an as-needed basis, without restarting the session.
+ */
+CUptiResult CUPTIAPI cuptiProfilerSetConfig(CUpti_Profiler_SetConfig_Params* pParams);
+/**
+ * \brief Unset metrics configuration profiled
+ *
+ */
+CUptiResult CUPTIAPI cuptiProfilerUnsetConfig(CUpti_Profiler_UnsetConfig_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerBeginPass
+ */
+typedef struct CUpti_Profiler_BeginPass_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_BeginPass_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_BeginPass_Params;
+#define CUpti_Profiler_BeginPass_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_BeginPass_Params, ctx)
+
+/**
+ * \brief Params for cuptiProfilerEndPass
+ */
+typedef struct CUpti_Profiler_EndPass_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_EndPass_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    uint16_t targetNestingLevel;                            //!< [out] The targetNestingLevel that will be collected by the *next* BeginPass.
+    size_t passIndex;                                       //!< [out] The passIndex that will be collected by the *next* BeginPass
+    uint8_t allPassesSubmitted;                             //!< [out] becomes true when the last pass has been queued to the GPU
+} CUpti_Profiler_EndPass_Params;
+#define CUpti_Profiler_EndPass_Params_STRUCT_SIZE                    CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_EndPass_Params, allPassesSubmitted)
+
+/**
+ * \brief Replay API: used for multipass collection.
+ *
+ * These APIs are used if the user chooses to perform the replay itself (\ref CUPTI_UserReplay or \ref CUPTI_ApplicationReplay)
+ * for multipass collection of the metrics configurations.
+ * It's a no-op in case of \ref CUPTI_KernelReplay.
+ */
+CUptiResult cuptiProfilerBeginPass(CUpti_Profiler_BeginPass_Params* pParams);
+
+/**
+ * \brief Replay API: used for multipass collection.
+ *
+ * These APIs are used if the user chooses to perform the replay itself (\ref CUPTI_UserReplay or \ref CUPTI_ApplicationReplay)
+ * for multipass collection of the metrics configurations.
+ * It's a no-op in case of \ref CUPTI_KernelReplay.
+ * Returns information for next pass.
+ */
+CUptiResult cuptiProfilerEndPass(CUpti_Profiler_EndPass_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerEnableProfiling
+ */
+typedef struct CUpti_Profiler_EnableProfiling_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_EnableProfiling_Params;
+#define CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_EnableProfiling_Params, ctx)
+
+/**
+ * \brief Params for cuptiProfilerDisableProfiling
+ */
+typedef struct CUpti_Profiler_DisableProfiling_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_DisableProfiling_Params;
+#define CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_DisableProfiling_Params, ctx)
+
+/**
+ * \brief Enables Profiling
+ *
+ * In \ref CUPTI_AutoRange, these APIs are used to enable/disable profiling for the kernels to be executed in
+ * a profiling session.
+ */
+CUptiResult CUPTIAPI cuptiProfilerEnableProfiling(CUpti_Profiler_EnableProfiling_Params* pParams);
+
+/**
+ * \brief Disable Profiling
+ *
+ * In \ref CUPTI_AutoRange, these APIs are used to enable/disable profiling for the kernels to be executed in
+ * a profiling session.
+ */
+CUptiResult CUPTIAPI cuptiProfilerDisableProfiling(CUpti_Profiler_DisableProfiling_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerIsPassCollected
+ */
+typedef struct CUpti_Profiler_IsPassCollected_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_IsPassCollected_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    size_t numRangesDropped;                                //!< [out] number of ranges whose data was dropped in the processed pass
+    size_t numTraceBytesDropped;                            //!< [out] number of bytes not written to TraceBuffer due to buffer full
+    uint8_t onePassCollected;                               //!< [out] true if a pass was successfully decoded
+    uint8_t allPassesCollected;                             //!< [out] becomes true when the last pass has been decoded
+} CUpti_Profiler_IsPassCollected_Params;
+#define CUpti_Profiler_IsPassCollected_Params_STRUCT_SIZE            CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_IsPassCollected_Params, allPassesCollected)
+
+/**
+ * \brief Asynchronous call to query if the submitted pass to GPU is collected
+ *
+ */
+CUptiResult CUPTIAPI cuptiProfilerIsPassCollected(CUpti_Profiler_IsPassCollected_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerFlushCounterData
+ */
+typedef struct CUpti_Profiler_FlushCounterData_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    size_t numRangesDropped;                                //!< [out] number of ranges whose data was dropped in the processed passes
+    size_t numTraceBytesDropped;                            //!< [out] number of bytes not written to TraceBuffer due to buffer full
+} CUpti_Profiler_FlushCounterData_Params;
+#define CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE           CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_FlushCounterData_Params, numTraceBytesDropped)
+
+/**
+ * \brief Decode all the submitted passes
+ *
+ * Flush Counter data API to ensure every pass is decoded into the counterDataImage passed at beginSession.
+ * This will cause a CPU/GPU sync to collect all the undecoded passes.
+ */
+CUptiResult CUPTIAPI cuptiProfilerFlushCounterData(CUpti_Profiler_FlushCounterData_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerPushRange
+ */
+typedef struct CUpti_Profiler_PushRange_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_PushRange_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    const char* pRangeName;                                 //!< [in] specifies the range for subsequent launches; must not be NULL
+    size_t rangeNameLength;                                 //!< [in] assign to strlen(pRangeName) if known; if set to zero, the library will call strlen()
+} CUpti_Profiler_PushRange_Params;
+#define CUpti_Profiler_PushRange_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_PushRange_Params, rangeNameLength)
+
+/**
+ * \brief Params for cuptiProfilerPopRange
+ */
+typedef struct CUpti_Profiler_PopRange_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_PopRange_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_PopRange_Params;
+#define CUpti_Profiler_PopRange_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_PopRange_Params, ctx)
+
+
+/**
+ * \brief Range API's : Push user range
+ *
+ * Counter data is collected per unique range-stack, identified by a string label passed by the user.
+ * It's an invalid operation in case of \ref CUPTI_AutoRange.
+ */
+CUptiResult CUPTIAPI cuptiProfilerPushRange(CUpti_Profiler_PushRange_Params *pParams);
+
+/**
+ * \brief Range API's : Pop user range
+ *
+ * Counter data is collected per unique range-stack, identified by a string label passed by the user.
+ * It's an invalid operation in case of \ref CUPTI_AutoRange.
+ */
+CUptiResult CUPTIAPI cuptiProfilerPopRange(CUpti_Profiler_PopRange_Params *pParams);
+
+/**
+ * \brief Params for cuptiProfilerGetCounterAvailability
+ */
+typedef struct CUpti_Profiler_GetCounterAvailability_Params
+{
+    size_t structSize;                                  //!< [in] CUpti_Profiler_GetCounterAvailability_Params_STRUCT_SIZE
+    void* pPriv;                                        //!< [in] assign to NULL
+    CUcontext ctx;                                      //!< [in] if NULL, the current CUcontext is used
+    size_t counterAvailabilityImageSize;                //!< [in/out] If `pCounterAvailabilityImage` is NULL, then the required size is returned in
+                                                        //!< `counterAvailabilityImageSize`, otherwise `counterAvailabilityImageSize` should be set to the size of
+                                                        //!< `pCounterAvailabilityImage`, and on return it would be overwritten with number of actual bytes copied
+    uint8_t* pCounterAvailabilityImage;                 //!< [in] buffer receiving counter availability image, may be NULL
+} CUpti_Profiler_GetCounterAvailability_Params;
+#define CUpti_Profiler_GetCounterAvailability_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_GetCounterAvailability_Params, pCounterAvailabilityImage)
+
+/**
+ * \brief Query counter availability
+ *
+ * Use this API to query counter availability information in a buffer which can be used to filter unavailable raw metrics on host.
+ * Note: This API may fail, if any profiling or sampling session is active on the specified context or its device.
+ */
+CUptiResult CUPTIAPI cuptiProfilerGetCounterAvailability(CUpti_Profiler_GetCounterAvailability_Params *pParams);
+
+/// Generic support level enum for CUPTI
+typedef enum
+{
+    CUPTI_PROFILER_CONFIGURATION_UNKNOWN = 0, //!< Configuration support level unknown - either detection code errored out before setting this value, or unable to determine it
+    CUPTI_PROFILER_CONFIGURATION_UNSUPPORTED, //!< Profiling is unavailable.  For specific feature fields, this means that the current configuration of this feature does not work with profiling.  For instance, SLI-enabled devices do not support profiling, and this value would be returned for SLI on an SLI-enabled device.
+    CUPTI_PROFILER_CONFIGURATION_DISABLED,    //!< Profiling would be available for this configuration, but was disabled by the system
+    CUPTI_PROFILER_CONFIGURATION_SUPPORTED    //!< Profiling is supported.  For specific feature fields, this means that the current configuration of this feature works with profiling.  For instance, SLI-enabled devices do not support profiling, and this value would only be returned for devices which are not SLI-enabled.
+} CUpti_Profiler_Support_Level;
+
+/**
+ * \brief Profiler API types
+ */
+typedef enum
+{
+    CUPTI_PROFILER_RANGE_PROFILING = 0,       //!< CUPTI APIs for range based profiling (cuptiProfiler*)
+    CUPTI_PROFILER_PC_SAMPLING,               //!< CUPTI APIs collecting pc sampling data (cuptiPcSampling*)
+    CUPTI_PROFILER_SASS_METRICS,              //!< CUPTI APIs collecting SASS metrics data (cuptiSassMetrics*)
+    CUPTI_PROFILER_UNKNOWN
+} CUpti_Profiler_API;
+
+/**
+ * \brief Params for cuptiProfilerDeviceSupported
+ */
+typedef struct
+{
+    size_t structSize;                                //!< [in] Must be CUpti_Profiler_DeviceSupported_Params_STRUCT_SIZE
+    void *pPriv;                                      //!< [in] assign to NULL
+    CUdevice cuDevice;                                //!< [in] if NULL, the current CUcontext is used
+
+    CUpti_Profiler_Support_Level isSupported;         //!< [out] overall SUPPORTED / UNSUPPORTED flag representing whether Profiling and PC Sampling APIs work on the given device and configuration. SUPPORTED if all following flags are SUPPORTED, UNSUPPORTED otherwise.
+
+    CUpti_Profiler_Support_Level architecture;        //!< [out] SUPPORTED if the device architecture level supports the Profiling API (Compute Capability >= 7.0), UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level sli;                 //!< [out] SUPPORTED if SLI is not enabled, UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level vGpu;                //!< [out] SUPPORTED if vGPU is supported and profiling is enabled, DISABLED if profiling is supported but not enabled, UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level confidentialCompute; //!< [out] SUPPORTED if confidential compute is not enabled, UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level cmp;                 //!< [out] SUPPORTED if not NVIDIA Crypto Mining Processors (CMP), UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level wsl;                 //!< [out] SUPPORTED if WSL supported, UNSUPPORTED otherwise
+    CUpti_Profiler_API     api;                       //!< [in] the CUPTI API type for which device support will be checked
+} CUpti_Profiler_DeviceSupported_Params;
+#define CUpti_Profiler_DeviceSupported_Params_STRUCT_SIZE CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_DeviceSupported_Params, api)
+
+/**
+ * \brief Query device compatibility with Profiling API
+ *
+ * Use this call to determine whether a compute device and configuration are compatible with the Profiling API.
+ * If the configuration does not support profiling, one of several flags will indicate why.
+ */
+CUptiResult CUPTIAPI cuptiProfilerDeviceSupported(CUpti_Profiler_DeviceSupported_Params *pParams);
+
+/** @} */ /* END CUPTI_METRIC_API */
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /*_CUPTI_PROFILER_TARGET_H_*/
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_runtime_cbid.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_runtime_cbid.h
new file mode 100644
index 0000000000000000000000000000000000000000..1db2cea872a5fce3b537df9770f7123d3796f6d6
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_runtime_cbid.h
@@ -0,0 +1,481 @@
+
+// *************************************************************************
+//      Definitions of indices for API functions, unique across entire API
+// *************************************************************************
+
+// This file is generated.  Any changes you make will be lost during the next clean build.
+// CUDA public interface, for type definitions and cu* function prototypes
+
+typedef enum CUpti_runtime_api_trace_cbid_enum {
+    CUPTI_RUNTIME_TRACE_CBID_INVALID                                                       = 0,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDriverGetVersion_v3020                                    = 1,
+    CUPTI_RUNTIME_TRACE_CBID_cudaRuntimeGetVersion_v3020                                   = 2,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceCount_v3020                                      = 3,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceProperties_v3020                                 = 4,
+    CUPTI_RUNTIME_TRACE_CBID_cudaChooseDevice_v3020                                        = 5,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetChannelDesc_v3020                                      = 6,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateChannelDesc_v3020                                   = 7,
+    CUPTI_RUNTIME_TRACE_CBID_cudaConfigureCall_v3020                                       = 8,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020                                       = 9,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020                                        = 10,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeekAtLastError_v3020                                     = 11,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetErrorString_v3020                                      = 12,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020                                              = 13,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetCacheConfig_v3020                                  = 14,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncGetAttributes_v3020                                   = 15,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020                                           = 16,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020                                           = 17,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetValidDevices_v3020                                     = 18,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDeviceFlags_v3020                                      = 19,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMalloc_v3020                                              = 20,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocPitch_v3020                                         = 21,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFree_v3020                                                = 22,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocArray_v3020                                         = 23,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeArray_v3020                                           = 24,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocHost_v3020                                          = 25,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeHost_v3020                                            = 26,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostAlloc_v3020                                           = 27,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostGetDevicePointer_v3020                                = 28,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostGetFlags_v3020                                        = 29,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemGetInfo_v3020                                          = 30,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020                                              = 31,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2D_v3020                                            = 32,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArray_v3020                                       = 33,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArray_v3020                                     = 34,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArray_v3020                                     = 35,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArray_v3020                                   = 36,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyArrayToArray_v3020                                  = 37,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DArrayToArray_v3020                                = 38,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbol_v3020                                      = 39,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbol_v3020                                    = 40,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020                                         = 41,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArrayAsync_v3020                                  = 42,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArrayAsync_v3020                                = 43,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DAsync_v3020                                       = 44,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArrayAsync_v3020                                = 45,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArrayAsync_v3020                              = 46,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbolAsync_v3020                                 = 47,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbolAsync_v3020                               = 48,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020                                              = 49,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2D_v3020                                            = 50,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020                                         = 51,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2DAsync_v3020                                       = 52,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSymbolAddress_v3020                                    = 53,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSymbolSize_v3020                                       = 54,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTexture_v3020                                         = 55,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTexture2D_v3020                                       = 56,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTextureToArray_v3020                                  = 57,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUnbindTexture_v3020                                       = 58,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureAlignmentOffset_v3020                           = 59,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureReference_v3020                                 = 60,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindSurfaceToArray_v3020                                  = 61,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSurfaceReference_v3020                                 = 62,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLSetGLDevice_v3020                                       = 63,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLRegisterBufferObject_v3020                              = 64,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLMapBufferObject_v3020                                   = 65,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLUnmapBufferObject_v3020                                 = 66,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLUnregisterBufferObject_v3020                            = 67,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLSetBufferObjectMapFlags_v3020                           = 68,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLMapBufferObjectAsync_v3020                              = 69,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLUnmapBufferObjectAsync_v3020                            = 70,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWGLGetDevice_v3020                                        = 71,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsGLRegisterImage_v3020                             = 72,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsGLRegisterBuffer_v3020                            = 73,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsUnregisterResource_v3020                          = 74,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceSetMapFlags_v3020                         = 75,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsMapResources_v3020                                = 76,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsUnmapResources_v3020                              = 77,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedPointer_v3020                    = 78,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsSubResourceGetMappedArray_v3020                   = 79,
+    CUPTI_RUNTIME_TRACE_CBID_cudaVDPAUGetDevice_v3020                                      = 80,
+    CUPTI_RUNTIME_TRACE_CBID_cudaVDPAUSetVDPAUDevice_v3020                                 = 81,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsVDPAURegisterVideoSurface_v3020                   = 82,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsVDPAURegisterOutputSurface_v3020                  = 83,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDevice_v3020                                      = 84,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDevices_v3020                                     = 85,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11SetDirect3DDevice_v3020                              = 86,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D11RegisterResource_v3020                       = 87,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDevice_v3020                                      = 88,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDevices_v3020                                     = 89,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10SetDirect3DDevice_v3020                              = 90,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D10RegisterResource_v3020                       = 91,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10RegisterResource_v3020                               = 92,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10UnregisterResource_v3020                             = 93,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10MapResources_v3020                                   = 94,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10UnmapResources_v3020                                 = 95,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceSetMapFlags_v3020                            = 96,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetSurfaceDimensions_v3020                   = 97,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedArray_v3020                         = 98,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedPointer_v3020                       = 99,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedSize_v3020                          = 100,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedPitch_v3020                         = 101,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDevice_v3020                                       = 102,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDevices_v3020                                      = 103,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9SetDirect3DDevice_v3020                               = 104,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDirect3DDevice_v3020                               = 105,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D9RegisterResource_v3020                        = 106,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9RegisterResource_v3020                                = 107,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnregisterResource_v3020                              = 108,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9MapResources_v3020                                    = 109,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnmapResources_v3020                                  = 110,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceSetMapFlags_v3020                             = 111,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetSurfaceDimensions_v3020                    = 112,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedArray_v3020                          = 113,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedPointer_v3020                        = 114,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedSize_v3020                           = 115,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedPitch_v3020                          = 116,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9Begin_v3020                                           = 117,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9End_v3020                                             = 118,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9RegisterVertexBuffer_v3020                            = 119,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnregisterVertexBuffer_v3020                          = 120,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9MapVertexBuffer_v3020                                 = 121,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnmapVertexBuffer_v3020                               = 122,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadExit_v3020                                          = 123,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDoubleForDevice_v3020                                  = 124,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDoubleForHost_v3020                                    = 125,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadSynchronize_v3020                                   = 126,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadGetLimit_v3020                                      = 127,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadSetLimit_v3020                                      = 128,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreate_v3020                                        = 129,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamDestroy_v3020                                       = 130,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_v3020                                   = 131,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamQuery_v3020                                         = 132,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventCreate_v3020                                         = 133,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateWithFlags_v3020                                = 134,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_v3020                                         = 135,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020                                        = 136,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventSynchronize_v3020                                    = 137,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventQuery_v3020                                          = 138,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventElapsedTime_v3020                                    = 139,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMalloc3D_v3020                                            = 140,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMalloc3DArray_v3020                                       = 141,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3D_v3020                                            = 142,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3DAsync_v3020                                       = 143,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3D_v3020                                            = 144,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DAsync_v3020                                       = 145,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadSetCacheConfig_v3020                                = 146,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_v3020                                     = 147,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDirect3DDevice_v3020                              = 148,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDirect3DDevice_v3020                              = 149,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadGetCacheConfig_v3020                                = 150,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPointerGetAttributes_v4000                                = 151,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostRegister_v4000                                        = 152,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostUnregister_v4000                                      = 153,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceCanAccessPeer_v4000                                 = 154,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceEnablePeerAccess_v4000                              = 155,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceDisablePeerAccess_v4000                             = 156,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeerRegister_v4000                                        = 157,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeerUnregister_v4000                                      = 158,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeerGetDevicePointer_v4000                                = 159,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyPeer_v4000                                          = 160,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyPeerAsync_v4000                                     = 161,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeer_v4000                                        = 162,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeerAsync_v4000                                   = 163,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceReset_v3020                                         = 164,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020                                   = 165,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetLimit_v3020                                      = 166,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetLimit_v3020                                      = 167,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetCacheConfig_v3020                                = 168,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetCacheConfig_v3020                                = 169,
+    CUPTI_RUNTIME_TRACE_CBID_cudaProfilerInitialize_v4000                                  = 170,
+    CUPTI_RUNTIME_TRACE_CBID_cudaProfilerStart_v4000                                       = 171,
+    CUPTI_RUNTIME_TRACE_CBID_cudaProfilerStop_v4000                                        = 172,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetByPCIBusId_v4010                                 = 173,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetPCIBusId_v4010                                   = 174,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLGetDevices_v4010                                        = 175,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcGetEventHandle_v4010                                   = 176,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcOpenEventHandle_v4010                                  = 177,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcGetMemHandle_v4010                                     = 178,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcOpenMemHandle_v4010                                    = 179,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcCloseMemHandle_v4010                                   = 180,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetInfo_v4010                                        = 181,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetSharedMemConfig_v4020                              = 182,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetSharedMemConfig_v4020                            = 183,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetSharedMemConfig_v4020                            = 184,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateTextureObject_v5000                                 = 185,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroyTextureObject_v5000                                = 186,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectResourceDesc_v5000                        = 187,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectTextureDesc_v5000                         = 188,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateSurfaceObject_v5000                                 = 189,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroySurfaceObject_v5000                                = 190,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSurfaceObjectResourceDesc_v5000                        = 191,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocMipmappedArray_v5000                                = 192,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetMipmappedArrayLevel_v5000                              = 193,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeMipmappedArray_v5000                                  = 194,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTextureToMipmappedArray_v5000                         = 195,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedMipmappedArray_v5000             = 196,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAddCallback_v5000                                   = 197,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreateWithFlags_v5000                               = 198,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectResourceViewDesc_v5000                    = 199,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetAttribute_v5000                                  = 200,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamDestroy_v5050                                       = 201,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreateWithPriority_v5050                            = 202,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetPriority_v5050                                   = 203,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetFlags_v5050                                      = 204,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetStreamPriorityRange_v5050                        = 205,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocManaged_v6000                                       = 206,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000           = 207,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAttachMemAsync_v6000                                = 208,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetErrorName_v6050                                        = 209,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050           = 210,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000                                        = 211,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceFlags_v7000                                      = 212,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_ptsz_v7000                                         = 213,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_ptsz_v7000                                   = 214,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_ptds_v7000                                         = 215,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2D_ptds_v7000                                       = 216,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArray_ptds_v7000                                  = 217,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArray_ptds_v7000                                = 218,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArray_ptds_v7000                                = 219,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArray_ptds_v7000                              = 220,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyArrayToArray_ptds_v7000                             = 221,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DArrayToArray_ptds_v7000                           = 222,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbol_ptds_v7000                                 = 223,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbol_ptds_v7000                               = 224,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_ptsz_v7000                                    = 225,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArrayAsync_ptsz_v7000                             = 226,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArrayAsync_ptsz_v7000                           = 227,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DAsync_ptsz_v7000                                  = 228,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArrayAsync_ptsz_v7000                           = 229,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArrayAsync_ptsz_v7000                         = 230,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbolAsync_ptsz_v7000                            = 231,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbolAsync_ptsz_v7000                          = 232,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset_ptds_v7000                                         = 233,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2D_ptds_v7000                                       = 234,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_ptsz_v7000                                    = 235,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2DAsync_ptsz_v7000                                  = 236,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetPriority_ptsz_v7000                              = 237,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetFlags_ptsz_v7000                                 = 238,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_ptsz_v7000                              = 239,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamQuery_ptsz_v7000                                    = 240,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAttachMemAsync_ptsz_v7000                           = 241,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_ptsz_v7000                                    = 242,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3D_ptds_v7000                                       = 243,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3DAsync_ptsz_v7000                                  = 244,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3D_ptds_v7000                                       = 245,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DAsync_ptsz_v7000                                  = 246,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_ptsz_v7000                                = 247,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAddCallback_ptsz_v7000                              = 248,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeer_ptds_v7000                                   = 249,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeerAsync_ptsz_v7000                              = 250,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000  = 251,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_v8000                                    = 252,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_ptsz_v8000                               = 253,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemAdvise_v8000                                           = 254,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetP2PAttribute_v8000                               = 255,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsEGLRegisterImage_v7000                            = 256,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerConnect_v7000                            = 257,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerDisconnect_v7000                         = 258,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerAcquireFrame_v7000                       = 259,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerReleaseFrame_v7000                       = 260,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerConnect_v7000                            = 261,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerDisconnect_v7000                         = 262,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerPresentFrame_v7000                       = 263,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerReturnFrame_v7000                        = 264,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedEglFrame_v7000                   = 265,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemRangeGetAttribute_v8000                                = 266,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemRangeGetAttributes_v8000                               = 267,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerConnectWithFlags_v7000                   = 268,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000                             = 269,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_ptsz_v9000                        = 270,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateFromEGLSync_v9000                              = 271,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000                  = 272,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetAttribute_v9000                                    = 273,
+    CUPTI_RUNTIME_TRACE_CBID_cudaImportExternalMemory_v10000                               = 274,
+    CUPTI_RUNTIME_TRACE_CBID_cudaExternalMemoryGetMappedBuffer_v10000                      = 275,
+    CUPTI_RUNTIME_TRACE_CBID_cudaExternalMemoryGetMappedMipmappedArray_v10000              = 276,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroyExternalMemory_v10000                              = 277,
+    CUPTI_RUNTIME_TRACE_CBID_cudaImportExternalSemaphore_v10000                            = 278,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v10000                      = 279,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_ptsz_v10000                 = 280,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v10000                        = 281,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_ptsz_v10000                   = 282,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroyExternalSemaphore_v10000                           = 283,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchHostFunc_v10000                                     = 284,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchHostFunc_ptsz_v10000                                = 285,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphCreate_v10000                                        = 286,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeGetParams_v10000                           = 287,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeSetParams_v10000                           = 288,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddKernelNode_v10000                                 = 289,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNode_v10000                                 = 290,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeGetParams_v10000                           = 291,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParams_v10000                           = 292,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemsetNode_v10000                                 = 293,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemsetNodeGetParams_v10000                           = 294,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemsetNodeSetParams_v10000                           = 295,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddHostNode_v10000                                   = 296,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphHostNodeGetParams_v10000                             = 297,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddChildGraphNode_v10000                             = 298,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphChildGraphNodeGetGraph_v10000                        = 299,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEmptyNode_v10000                                  = 300,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphClone_v10000                                         = 301,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeFindInClone_v10000                               = 302,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetType_v10000                                   = 303,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetRootNodes_v10000                                  = 304,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependencies_v10000                           = 305,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependentNodes_v10000                         = 306,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddDependencies_v10000                               = 307,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphRemoveDependencies_v10000                            = 308,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphDestroyNode_v10000                                   = 309,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiate_v10000                                   = 310,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000                                        = 311,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000                                   = 312,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecDestroy_v10000                                   = 313,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphDestroy_v10000                                       = 314,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCapture_v10000                                 = 315,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCapture_ptsz_v10000                            = 316,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamIsCapturing_v10000                                  = 317,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamIsCapturing_ptsz_v10000                             = 318,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamEndCapture_v10000                                   = 319,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamEndCapture_ptsz_v10000                              = 320,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphHostNodeSetParams_v10000                             = 321,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetNodes_v10000                                      = 322,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetEdges_v10000                                      = 323,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v10010                               = 324,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_ptsz_v10010                          = 325,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecKernelNodeSetParams_v10010                       = 326,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadExchangeStreamCaptureMode_v10010                    = 327,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetNvSciSyncAttributes_v10020                       = 328,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyAvailableDynamicSMemPerBlock_v10200              = 329,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetFlags_v10200                                     = 330,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetFlags_ptsz_v10200                                = 331,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParams_v10020                       = 332,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemsetNodeSetParams_v10020                       = 333,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecHostNodeSetParams_v10020                         = 334,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecUpdate_v10020                                    = 335,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetFuncBySymbol_v11000                                    = 336,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCtxResetPersistingL2Cache_v11000                          = 337,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeCopyAttributes_v11000                      = 338,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeGetAttribute_v11000                        = 339,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeSetAttribute_v11000                        = 340,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCopyAttributes_v11000                               = 341,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCopyAttributes_ptsz_v11000                          = 342,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetAttribute_v11000                                 = 343,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetAttribute_ptsz_v11000                            = 344,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_v11000                                 = 345,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_ptsz_v11000                            = 346,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetTexture1DLinearMaxWidth_v11010                   = 347,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphUpload_v10000                                        = 348,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphUpload_ptsz_v10000                                   = 349,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNodeToSymbol_v11010                         = 350,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNodeFromSymbol_v11010                       = 351,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNode1D_v11010                               = 352,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParamsToSymbol_v11010                   = 353,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParamsFromSymbol_v11010                 = 354,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParams1D_v11010                         = 355,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParamsToSymbol_v11010               = 356,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParamsFromSymbol_v11010             = 357,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParams1D_v11010                     = 358,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetSparseProperties_v11010                           = 359,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMipmappedArrayGetSparseProperties_v11010                  = 360,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecChildGraphNodeSetParams_v11010                   = 361,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEventRecordNode_v11010                            = 362,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventRecordNodeGetEvent_v11010                       = 363,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventRecordNodeSetEvent_v11010                       = 364,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEventWaitNode_v11010                              = 365,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventWaitNodeGetEvent_v11010                         = 366,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventWaitNodeSetEvent_v11010                         = 367,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecEventRecordNodeSetEvent_v11010                   = 368,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecEventWaitNodeSetEvent_v11010                     = 369,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecordWithFlags_v11010                               = 370,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecordWithFlags_ptsz_v11010                          = 371,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetDefaultMemPool_v11020                            = 372,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocAsync_v11020                                        = 373,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocAsync_ptsz_v11020                                   = 374,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeAsync_v11020                                          = 375,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeAsync_ptsz_v11020                                     = 376,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolTrimTo_v11020                                      = 377,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolSetAttribute_v11020                                = 378,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolGetAttribute_v11020                                = 379,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolSetAccess_v11020                                   = 380,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetPlane_v11020                                      = 381,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolGetAccess_v11020                                   = 382,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolCreate_v11020                                      = 383,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolDestroy_v11020                                     = 384,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetMemPool_v11020                                   = 385,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetMemPool_v11020                                   = 386,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolExportToShareableHandle_v11020                     = 387,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolImportFromShareableHandle_v11020                   = 388,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolExportPointer_v11020                               = 389,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolImportPointer_v11020                               = 390,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocFromPoolAsync_v11020                                = 391,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocFromPoolAsync_ptsz_v11020                           = 392,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v2_v11020                   = 393,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v2_ptsz_v11020              = 394,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v2_v11020                     = 395,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v2_ptsz_v11020                = 396,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddExternalSemaphoresSignalNode_v11020               = 397,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresSignalNodeGetParams_v11020         = 398,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresSignalNodeSetParams_v11020         = 399,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddExternalSemaphoresWaitNode_v11020                 = 400,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresWaitNodeGetParams_v11020           = 401,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresWaitNodeSetParams_v11020           = 402,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecExternalSemaphoresSignalNodeSetParams_v11020     = 403,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecExternalSemaphoresWaitNodeSetParams_v11020       = 404,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceFlushGPUDirectRDMAWrites_v11030                     = 405,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPoint_v11030                                = 406,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPoint_ptsz_v11030                           = 407,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphDebugDotPrint_v11030                                 = 408,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v2_v11030                            = 409,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v2_ptsz_v11030                       = 410,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_v11030                    = 411,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_ptsz_v11030               = 412,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectCreate_v11030                                   = 413,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectRetain_v11030                                   = 414,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectRelease_v11030                                  = 415,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphRetainUserObject_v11030                              = 416,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphReleaseUserObject_v11030                             = 417,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiateWithFlags_v11040                          = 418,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemAllocNode_v11040                               = 419,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemAllocNodeGetParams_v11040                         = 420,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemFreeNode_v11040                                = 421,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemFreeNodeGetParams_v11040                          = 422,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGraphMemTrim_v11040                                 = 423,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetGraphMemAttribute_v11040                         = 424,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetGraphMemAttribute_v11040                         = 425,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeSetEnabled_v11060                                = 426,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetEnabled_v11060                                = 427,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetMemoryRequirements_v11060                         = 428,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMipmappedArrayGetMemoryRequirements_v11060                = 429,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060                                    = 430,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_ptsz_v11060                               = 431,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxPotentialClusterSize_v11070                   = 432,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveClusters_v11070                         = 433,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateTextureObject_v2_v11080                             = 434,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectTextureDesc_v2_v11080                     = 435,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiateWithParams_v12000                         = 436,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiateWithParams_ptsz_v12000                    = 437,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecGetFlags_v12000                                  = 438,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetKernel_v12000                                          = 439,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceProperties_v2_v12000                             = 440,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetId_v12000                                        = 441,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetId_ptsz_v12000                                   = 442,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiate_v12000                                   = 443,
+    CUPTI_RUNTIME_TRACE_CBID_cudaInitDevice_v12000                                         = 444,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddNode_v12020                                       = 445,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeSetParams_v12020                                 = 446,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecNodeSetParams_v12020                             = 447,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemAdvise_v2_v12020                                       = 448,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_v2_v12020                                = 449,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_v2_ptsz_v12020                           = 450,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncGetName_v12030                                        = 451,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCaptureToGraph_v12030                          = 452,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCaptureToGraph_ptsz_v12030                     = 453,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphConditionalHandleCreate_v12030                       = 454,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetEdges_v2_v12030                                   = 455,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependencies_v2_v12030                        = 456,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependentNodes_v2_v12030                      = 457,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddDependencies_v2_v12030                            = 458,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphRemoveDependencies_v2_v12030                         = 459,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddNode_v2_v12030                                    = 460,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v3_v12030                            = 461,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v3_ptsz_v12030                       = 462,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_v2_v12030                 = 463,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_v2_ptsz_v12030            = 464,
+    CUPTI_RUNTIME_TRACE_CBID_cuda465_v12040                                                = 465,
+    CUPTI_RUNTIME_TRACE_CBID_cuda466_v12040                                                = 466,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncGetParamInfo_v12040                                   = 467,
+    CUPTI_RUNTIME_TRACE_CBID_SIZE                                                          = 468,
+    CUPTI_RUNTIME_TRACE_CBID_FORCE_INT                                                     = 0x7fffffff
+} CUpti_runtime_api_trace_cbid;
+
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_sass_metrics.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_sass_metrics.h
new file mode 100644
index 0000000000000000000000000000000000000000..acb59cf8e5882a5ff13b4a1b0fdc6bc7b0ec47f7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cupti_sass_metrics.h
@@ -0,0 +1,436 @@
+/*
+ * Copyright 2023 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_SASS_METRICS_H_)
+#define _CUPTI_SASS_METRICS_H_
+
+#include <cuda.h>
+#include <cupti_result.h>
+#include <cupti_profiler_target.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_SASS_METRICS_API CUPTI SASS Metrics API
+ * Functions, types, and enums that implement the CUPTI SASS Metrics API.
+ * @{
+ */
+
+typedef enum
+{
+    /// SASS metric data will be collected at GPU level.
+    /// In the CUpti_SassMetricsGetDataProperties_Params struct, numOfInstances will be equal to 1.
+    CUPTI_SASS_METRICS_OUTPUT_GRANULARITY_GPU = 0,
+
+    /// SASS metric data will be collected at SM level.
+    /// In the CUpti_SassMetricsGetDataProperties_Params struct, numOfInstances will be equal to the number of SMs in the GPU.
+    CUPTI_SASS_METRICS_OUTPUT_GRANULARITY_SM = 1,
+
+    /// SASS metric data will be collected at SM sub-partition level.
+    /// In the CUpti_SassMetricsGetDataProperties_Params struct, numOfInstances will be equal to the number of SM sub-partitions in the GPU.
+    CUPTI_SASS_METRICS_OUTPUT_GRANULARITY_SMSP = 2,
+
+    CUPTI_SASS_METRICS_OUTPUT_GRANULARITY_INVALID ///< Implicitly 3 (follows SMSP); presumably a sentinel, not a selectable level - confirm.
+} CUpti_SassMetrics_OutputGranularity;
+
+typedef struct CUpti_SassMetrics_MetricDetails
+{
+    /// Unique ID of the SASS metric.
+    uint64_t metricId;
+    /// Metric name.
+    const char* pMetricName;
+    /// Metric description.
+    const char* pMetricDescription;
+} CUpti_SassMetrics_MetricDetails;
+
+/**
+ * \brief Params for cuptiSassMetricsGetNumOfMetrics
+ *
+ * The caller supplies structSize, pPriv and pChipName; numOfMetrics is the output field.
+ */
+typedef struct CUpti_SassMetrics_GetNumOfMetrics_Params
+{
+    /// [in] should be equal to CUpti_SassMetrics_GetNumOfMetrics_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] name of the chip for which metrics will be queried
+    const char* pChipName;
+    /// [out] number of metrics supported for the queried chip
+    size_t numOfMetrics;
+} CUpti_SassMetrics_GetNumOfMetrics_Params;
+
+#define CUpti_SassMetrics_GetNumOfMetrics_Params_STRUCT_SIZE                 CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetrics_GetNumOfMetrics_Params, numOfMetrics)
+
+/**
+ * \brief Get the number of supported SASS metrics for the chip.
+ * 
+ * \param pParams A pointer to \ref CUpti_SassMetrics_GetNumOfMetrics_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsGetNumOfMetrics(CUpti_SassMetrics_GetNumOfMetrics_Params* pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsGetMetrics
+ *
+ * The caller supplies structSize, pPriv, pChipName and numOfMetrics; pMetricsList is the output field.
+ */
+typedef struct CUpti_SassMetrics_GetMetrics_Params
+{
+    /// [in] should be equal to CUpti_SassMetrics_GetMetrics_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] name of the chip for which metrics will be queried
+    const char* pChipName;
+    /// [in] number of metrics supported for the queried chip (can be queried using cuptiSassMetricsGetNumOfMetrics())
+    size_t numOfMetrics;
+    /// [out] list of metrics supported for the queried chip (presumably a caller-allocated array of numOfMetrics entries - confirm against the CUPTI docs)
+    CUpti_SassMetrics_MetricDetails* pMetricsList;
+} CUpti_SassMetrics_GetMetrics_Params;
+#define CUpti_SassMetrics_GetMetrics_Params_STRUCT_SIZE                 CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetrics_GetMetrics_Params, pMetricsList)
+
+/**
+ * \brief Get the list of all supported SASS metrics for the chip.
+ * 
+ * \param pParams A pointer to \ref CUpti_SassMetrics_GetMetrics_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsGetMetrics(CUpti_SassMetrics_GetMetrics_Params* pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsGetProperties
+ *
+ * The caller supplies structSize, pPriv, pChipName and pMetricName; metric is the output field.
+ */
+typedef struct CUpti_SassMetrics_GetProperties_Params
+{
+    /// [in] should be equal to CUpti_SassMetrics_GetProperties_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] name of the chip for which the metric will be queried
+    const char* pChipName;
+    /// [in] name of the metric to look up
+    const char* pMetricName;
+    /// [out] returns the metric ID and the metric description
+    CUpti_SassMetrics_MetricDetails metric;
+} CUpti_SassMetrics_GetProperties_Params;
+#define CUpti_SassMetrics_GetProperties_Params_STRUCT_SIZE        CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetrics_GetProperties_Params, metric)
+
+/**
+ * \brief Get metric properties for the queried metric.
+ * For a given metric the results will be put in CUpti_SassMetrics_MetricDetails which
+ * stores metric ID, description of the metric.
+ * 
+ * \param pParams A pointer to \ref CUpti_SassMetrics_GetProperties_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsGetProperties(CUpti_SassMetrics_GetProperties_Params *pParams);
+
+typedef struct CUpti_SassMetrics_Config
+{
+    /// [in] unique ID of the SASS metric; can be queried using cuptiSassMetricsGetProperties()
+    uint64_t metricId;
+    /// [in] output granularity: a CUpti_SassMetrics_OutputGranularity value stored in a uint8_t
+    uint8_t outputGranularity;
+} CUpti_SassMetrics_Config;
+
+/**
+ * \brief Params for cuptiSassMetricsSetConfig
+ */
+typedef struct CUpti_SassMetricsSetConfig_Params
+{
+    /// [in] equal to CUpti_SassMetricsSetConfig_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] number of metric configs; will be equal to the number of metrics queried
+    size_t numOfMetricConfig;
+    /// [in] list of metric configs generated for the given SASS metrics
+    CUpti_SassMetrics_Config* pConfigs;
+    /// [in] device index for which the config will be set; the user can call this once for
+    /// the device on which the SASS metric data will be collected
+    uint32_t deviceIndex;
+} CUpti_SassMetricsSetConfig_Params;
+#define CUpti_SassMetricsSetConfig_Params_STRUCT_SIZE                    CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsSetConfig_Params, deviceIndex)
+
+/**
+ * \brief Set config for the SASS metric data collection for a device.
+ * The user needs to call this API before calling any of the SASS metric data collection APIs.
+ * Each set config API call needs to be followed by a cuptiSassMetricsUnsetConfig() API call
+ * before calling the cuptiSassMetricsSetConfig() API again for the same device.
+ * 
+ * \param pParams A pointer to \ref CUpti_SassMetricsSetConfig_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if any cuda context has not been created prior to this API call
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this is called multiple times for the device without calling unset config API
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsSetConfig(CUpti_SassMetricsSetConfig_Params *pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsUnsetConfig
+ */
+typedef struct CUpti_SassMetricsUnsetConfig_Params
+{
+    /// [in] equal to CUpti_SassMetricsUnsetConfig_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] device index for which the SASS metric data collection config will be reset; the user needs to call this API for
+    /// all the devices on which SASS metric data collection has been configured.
+    uint32_t deviceIndex;
+} CUpti_SassMetricsUnsetConfig_Params;
+#define CUpti_SassMetricsUnsetConfig_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsUnsetConfig_Params, deviceIndex)
+
+/**
+ * \brief The unset config API resets the SASS metric data collection configuration for the device.
+ * Once this API is called, CUPTI will deallocate all the memory allocated and remove all
+ * the configuration for SASS metric data collection. The user can only call this API for a device on which
+ * the cuptiSassMetricsSetConfig() API has been called earlier.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsUnsetConfig_Params
+ * 
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if any cuda context has not been created prior to this API call
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this is called multiple times for the device without calling set config API
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsUnsetConfig(CUpti_SassMetricsUnsetConfig_Params *pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsEnable
+ */
+typedef struct CUpti_SassMetricsEnable_Params
+{
+    /// [in] equal to CUpti_SassMetricsEnable_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] CUDA context on which SASS metric data collection will be enabled.
+    /// If set to NULL, the default context will be considered for SASS metric data collection.
+    CUcontext ctx;
+    /// [in] if false, all the functions will be patched regardless of their execution with the cuptiSassMetricsEnable() API call.
+    /// When this parameter is set to true, metric data collection for the function will be done at the very first execution in the enable/disable
+    /// range.
+    uint8_t enableLazyPatching;
+} CUpti_SassMetricsEnable_Params;
+#define CUpti_SassMetricsEnable_Params_STRUCT_SIZE                       CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsEnable_Params, enableLazyPatching)
+
+/**
+ * \brief The SASS metric data collection enable API marks the start of a range, within which kernels
+ * will be profiled for SASS metrics.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsEnable_Params
+ * 
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if any cuda context has not been created prior to this API call
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called multiple times for a cuda context without calling 
+ * cuptiSassMetricsDisable() API or called before cuptiSassMetricsSetConfig() API call.
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsEnable(CUpti_SassMetricsEnable_Params* pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsDisable
+ */
+typedef struct CUpti_SassMetricsDisable_Params
+{
+    /// [in] equal to CUpti_SassMetricsDisable_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] CUDA context on which SASS metric data collection will be disabled.
+    /// If set to NULL, the default context will be considered for SASS metric data collection.
+    CUcontext ctx;
+    /// [out] The number of dropped SASS records will be equal to numOfPatchedInstructions * numOfInstances.
+    /// The number of dropped records will be zero when data is flushed prior to calling the disable API.
+    size_t numOfDroppedRecords;
+} CUpti_SassMetricsDisable_Params;
+#define CUpti_SassMetricsDisable_Params_STRUCT_SIZE                      CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsDisable_Params, numOfDroppedRecords)
+
+/**
+ * \brief SASS metric data collection disable API will mark the end of a range, any kernel launched after this
+ * API call will not be profiled for the SASS metrics.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsDisable_Params
+ * 
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if any cuda context has not been created prior to this API call
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called multiple times for a cuda context without calling 
+ * cuptiSassMetricsEnable() API or called before cuptiSassMetricsSetConfig() API call.
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsDisable(CUpti_SassMetricsDisable_Params* pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsGetDataProperties
+ */
+typedef struct CUpti_SassMetricsGetDataProperties_Params
+{
+    /// [in] equal to CUpti_SassMetricsGetDataProperties_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] CUDA context on which SASS metric data collection was enabled.
+    /// If set to NULL, the default context will be considered for SASS metric data collection.
+    CUcontext ctx;
+    /// [out] total number of SASS records that have been collected
+    size_t numOfPatchedInstructionRecords;
+    /// [out] number of instances for each metric value per instruction.
+    /// This will depend on the CUpti_SassMetrics_OutputGranularity level set for the metric config.
+    size_t numOfInstances;
+} CUpti_SassMetricsGetDataProperties_Params;
+
+#define CUpti_SassMetricsGetDataProperties_Params_STRUCT_SIZE           CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsGetDataProperties_Params, numOfInstances)
+/**
+ * \brief The SASS metric data properties API reports the number of instances of a metric
+ * value and the number of SASS instruction records that have been collected. The number of instances of a metric
+ * varies with the output granularity level the user set via a CUpti_SassMetrics_OutputGranularity value.
+ * The user needs to allocate memory for retrieving the SASS data using the cuptiSassMetricsFlushData() API.
+ * 
+ * \param pParams A pointer to \ref CUpti_SassMetricsGetDataProperties_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called outside the enable/disable range.
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsGetDataProperties(CUpti_SassMetricsGetDataProperties_Params* pParams);
+
+typedef struct CUpti_SassMetrics_InstanceValue
+{
+    /// unique ID of the metric
+    uint64_t metricId;
+    /// metric value
+    uint64_t value;
+} CUpti_SassMetrics_InstanceValue;
+#define CUpti_SassMetrics_InstanceValue_STRUCT_SIZE                      CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetrics_InstanceValue, value)
+
+typedef struct CUpti_SassMetrics_Data
+{
+    /// [in] equal to CUpti_SassMetricsFlushData_Params_STRUCT_SIZE. NOTE(review): that macro belongs to the flush params struct and no CUpti_SassMetrics_Data_STRUCT_SIZE is defined in this header - confirm the expected value.
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [out] unique cubin ID
+    uint32_t cubinCrc;
+    /// [out] the function's unique symbol index in the module.
+    uint32_t functionIndex;
+    /// [out] the function name
+    const char* functionName;
+    /// [out] PC offset for the function in a module
+    uint32_t pcOffset;
+    /// [out] array of size equal to the number of instances per metric, which contains the metric ID and metric value.
+    CUpti_SassMetrics_InstanceValue* pInstanceValues;
+} CUpti_SassMetrics_Data;
+
+/**
+ * \brief Params for cuptiSassMetricsFlushData
+ *
+ * NOTE(review): CUpti_SassMetricsFlushData_Params_STRUCT_SIZE below is keyed on numOfInstances,
+ * while the last declared member is pMetricsData. Confirm this is intentional before changing it,
+ * since the CUPTI library may validate structSize against this exact value.
+ */
+typedef struct CUpti_SassMetricsFlushData_Params
+{
+    /// [in] equal to CUpti_SassMetricsFlushData_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] CUDA context on which SASS metric data collection was enabled.
+    /// If set to NULL, the default context will be considered for SASS metric data collection.
+    CUcontext ctx;
+    /// [in] number of patched instruction records to be retrieved; the user can call cuptiSassMetricsGetDataProperties()
+    /// to get the total number of records available.
+    size_t numOfPatchedInstructionRecords;
+    /// [in] number of patched instruction record instances for a metric; the user can call cuptiSassMetricsGetDataProperties()
+    /// to get the total number of instances for each record per metric available.
+    size_t numOfInstances;
+    /// [out] records flushed by cuptiSassMetricsFlushData() (presumably a user-allocated array of numOfPatchedInstructionRecords entries - confirm)
+    CUpti_SassMetrics_Data* pMetricsData;
+} CUpti_SassMetricsFlushData_Params;
+#define CUpti_SassMetricsFlushData_Params_STRUCT_SIZE                      CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsFlushData_Params, numOfInstances)
+
+/**
+ * \brief Flush SASS metrics data from CUPTI internal buffer to the user buffer.
+ * User needs to allocate the buffer for retrieving the data. The number of records collected
+ * can be queried using the API cuptiSassMetricsGetDataProperties().
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsFlushData_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection.
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called outside the enable/disable range.
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsFlushData(CUpti_SassMetricsFlushData_Params* pParams);
+
+/** @} */ /* END CUPTI_SASS_METRICS_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // _CUPTI_SASS_METRICS_H_
\ No newline at end of file
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/device_atomic_functions.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/device_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..3fa21ad8c1caef27fe00c315759f9379c247302c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/device_atomic_functions.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DEVICE_ATOMIC_FUNCTIONS_H__)
+#define __DEVICE_ATOMIC_FUNCTIONS_H__
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if defined(__CUDACC_RTC__)
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __device__
+#elif defined(_NVHPC_CUDA)
+# define __DEVICE_ATOMIC_FUNCTIONS_DECL__ extern __device__ __cudart_builtin__
+#else /* !__CUDACC_RTC__ && !_NVHPC_CUDA */
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ / _NVHPC_CUDA / neither */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/* Add !defined(_NVHPC_CUDA) to avoid empty function definition in PGI CUDA
+ * C++ compiler where the macro __CUDA_ARCH__ is not defined. */
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* __CUDA_ARCH__ || _NVHPC_CUDA */
+#define __DEF_IF_HOST ;
+#endif /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+
+
+/*******************************************************************************
+* Atomic functions on int / unsigned int, plus float atomicExch.               *
+* __DEF_IF_HOST expands to an empty body { } or to ';' (see its #if above).    *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val) __DEF_IF_HOST
+
+/*******************************************************************************
+* Deprecated __any / __all builtins and the message attached to them.          *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if defined(_WIN32)
+# define __DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
+# define __DEPRECATED__(msg) __attribute__((deprecated))
+#else
+# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#endif
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on compute_70 and above, and should be replaced with "#x"_sync()."\
+    " To continue using "#x"(), specify virtual architecture compute_60 when targeting sm_70 and above, for example, using the pair of compiler options: -arch=compute_60 -code=sm_70."
+#elif defined(_NVHPC_CUDA)
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on cc70 and above, and should be replaced with "#x"_sync()."
+#else
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
+#endif
+
+extern "C"
+{
+extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__any)) int __any(int cond);
+extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__all)) int __all(int cond);
+}
+
+
+/*******************************************************************************
+* unsigned long long atomics and the deprecated any() / all().                 *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__any)) bool any(bool cond) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__all)) bool all(bool cond) __DEF_IF_HOST
+
+#undef __DEPRECATED__
+#undef __WSB_DEPRECATION_MESSAGE
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __DEVICE_ATOMIC_FUNCTIONS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "device_atomic_functions.hpp"
+#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
+
+#undef EXCLUDE_FROM_RTC
+
+#endif /* !__DEVICE_ATOMIC_FUNCTIONS_H__ */
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/device_atomic_functions.hpp b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/device_atomic_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..153ac712aab4288e4c16dd229460b677e7b61152
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/device_atomic_functions.hpp
@@ -0,0 +1,254 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DEVICE_ATOMIC_FUNCTIONS_HPP__)
+#define __DEVICE_ATOMIC_FUNCTIONS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+
+extern "C"
+{ /* __device_builtin__ entry points that the atomicXxx wrappers in this file forward to */
+extern __device__ __device_builtin__ int          __iAtomicAdd(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicAdd(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicExch(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicExch(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ float        __fAtomicExch(float *address, float val);
+extern __device__ __device_builtin__ int          __iAtomicMin(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicMin(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicMax(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicMax(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicInc(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicDec(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicAnd(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicAnd(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicOr(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicOr(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicXor(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicXor(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicCAS(int *address, int compare, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicCAS(unsigned int *address, unsigned int compare, unsigned int val);
+
+/* unsigned long long variants */
+extern __device__ __device_builtin__ unsigned long long int __ullAtomicAdd(unsigned long long int *address, unsigned long long int val);
+extern __device__ __device_builtin__ unsigned long long int __ullAtomicExch(unsigned long long int *address, unsigned long long int val);
+extern __device__ __device_builtin__ unsigned long long int __ullAtomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val);
+}
+
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+* int / unsigned int / float wrappers: each forwards to a builtin above.       *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/* int overload: forwards to the __iAtomicAdd builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val) {
+  return __iAtomicAdd(address, val);
+}
+
+/* unsigned int overload: forwards to the __uAtomicAdd builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val) {
+  return __uAtomicAdd(address, val);
+}
+
+/* Adds the negation of val via __iAtomicAdd; negates in unsigned so val == INT_MIN wraps instead of overflowing. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val) {
+  return __iAtomicAdd(address, (int)(0u - (unsigned int)val));
+}
+
+/* Adds the modular negation of val via __uAtomicAdd; 0u - val is well defined for every val (no signed overflow). */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val) {
+  return __uAtomicAdd(address, 0u - val);
+}
+
+/* int overload: forwards to the __iAtomicExch builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val) {
+  return __iAtomicExch(address, val);
+}
+
+/* unsigned int overload: forwards to the __uAtomicExch builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val) {
+  return __uAtomicExch(address, val);
+}
+
+/* float overload: forwards to the __fAtomicExch builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val) {
+  return __fAtomicExch(address, val);
+}
+
+/* int overload: forwards to the __iAtomicMin builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val) {
+  return __iAtomicMin(address, val);
+}
+
+/* unsigned int overload: forwards to the __uAtomicMin builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val) {
+  return __uAtomicMin(address, val);
+}
+
+/* int overload: forwards to the __iAtomicMax builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val) {
+  return __iAtomicMax(address, val);
+}
+
+/* unsigned int overload: forwards to the __uAtomicMax builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val) {
+  return __uAtomicMax(address, val);
+}
+
+/* Forwards to the __uAtomicInc builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val) {
+  return __uAtomicInc(address, val);
+}
+
+/* Forwards to the __uAtomicDec builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val) {
+  return __uAtomicDec(address, val);
+}
+
+/* int overload: forwards to the __iAtomicAnd builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val) {
+  return __iAtomicAnd(address, val);
+}
+
+/* unsigned int overload: forwards to the __uAtomicAnd builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val) {
+  return __uAtomicAnd(address, val);
+}
+
+/* int overload: forwards to the __iAtomicOr builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val) {
+  return __iAtomicOr(address, val);
+}
+
+/* unsigned int overload: forwards to the __uAtomicOr builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val) {
+  return __uAtomicOr(address, val);
+}
+
+/* int overload: forwards to the __iAtomicXor builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val) {
+  return __iAtomicXor(address, val);
+}
+
+/* unsigned int overload: forwards to the __uAtomicXor builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val) {
+  return __uAtomicXor(address, val);
+}
+
+/* int overload: passes address, compare and val unchanged to __iAtomicCAS and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val) {
+  return __iAtomicCAS(address, compare, val);
+}
+
+/* unsigned int overload: passes address, compare and val unchanged to __uAtomicCAS and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val) {
+  return __uAtomicCAS(address, compare, val);
+}
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+* unsigned long long wrappers and the bool any() / all() helpers.              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/* unsigned long long overload: forwards to the __ullAtomicAdd builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val) {
+  return __ullAtomicAdd(address, val);
+}
+
+/* unsigned long long overload: forwards to the __ullAtomicExch builtin and returns its result. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val) {
+  return __ullAtomicExch(address, val);
+}
+
+/* unsigned long long overload: passes address, compare and val unchanged to __ullAtomicCAS. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val) {
+  return __ullAtomicCAS(address, compare, val);
+}
+
+/* bool wrapper: converts cond to int for __any and converts the int result back to bool. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool any(bool cond) {
+  return static_cast<bool>(__any(static_cast<int>(cond)));
+}
+
+/* bool wrapper: converts cond to int for __all and converts the int result back to bool. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool all(bool cond) {
+  return static_cast<bool>(__all(static_cast<int>(cond)));
+}
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEVICE_ATOMIC_FUNCTIONS_DECL__
+
+#endif /* !__DEVICE_ATOMIC_FUNCTIONS_HPP__ */
+
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/device_functions.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/device_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..0094cc9a0a57f53f47421a8ecc400fb84c26babe
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/device_functions.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("device_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "device_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif /* _MSC_VER */
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__ /* records that this wrapper set the guard, so it is undone below */
+#endif /* !__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ */
+
+#include "crt/device_functions.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__
+#endif /* __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__ */
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_30_intrinsics.hpp b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_30_intrinsics.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a5bcac5ee68c0cf547e4de7c08badf37106639dc
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_30_intrinsics.hpp
@@ -0,0 +1,604 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_30_INTRINSICS_HPP__)
+#define __SM_30_INTRINSICS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_30_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_30_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+// In here are intrinsics which are built in to the compiler. These may be
+// referenced by intrinsic implementations from this file.
+extern "C"
+{
+}
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-3.0 intrinsics which are included as        *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined warpSize && !defined __local_warpSize
+#define warpSize    32
+#define __local_warpSize
+#endif
+
+// Forwards mask, base and offset to the builtin __nvvm_fns, which is declared at block scope here.
+__SM_30_INTRINSICS_DECL__ unsigned __fns(unsigned mask, unsigned base, int offset) {
+  extern __device__ __device_builtin__ unsigned int __nvvm_fns(unsigned int mask, unsigned int base, int offset);
+  return __nvvm_fns(mask, base, offset);
+}
+
+// Forwards id to the builtin __nvvm_barrier_sync, which is declared at block scope here.
+__SM_30_INTRINSICS_DECL__ void __barrier_sync(unsigned id) {
+  extern __device__ __device_builtin__ void __nvvm_barrier_sync(unsigned id);
+  __nvvm_barrier_sync(id);
+}
+
+// Forwards id and cnt to the builtin __nvvm_barrier_sync_cnt, which is declared at block scope here.
+__SM_30_INTRINSICS_DECL__ void __barrier_sync_count(unsigned id, unsigned cnt) {
+  extern __device__ __device_builtin__ void __nvvm_barrier_sync_cnt(unsigned id, unsigned cnt);
+  __nvvm_barrier_sync_cnt(id, cnt);
+}
+
+// Forwards mask to the builtin __nvvm_bar_warp_sync, which is declared at block scope here.
+__SM_30_INTRINSICS_DECL__ void __syncwarp(unsigned mask) {
+  extern __device__ __device_builtin__ void __nvvm_bar_warp_sync(unsigned mask);
+  __nvvm_bar_warp_sync(mask);
+}
+
+// Forwards mask and pred to the builtin __nvvm_vote_all_sync and returns its result.
+__SM_30_INTRINSICS_DECL__ int __all_sync(unsigned mask, int pred) {
+  extern __device__ __device_builtin__ int __nvvm_vote_all_sync(unsigned int mask, int pred);
+  return __nvvm_vote_all_sync(mask, pred);
+}
+
+// Forwards mask and pred to the builtin __nvvm_vote_any_sync and returns its result.
+__SM_30_INTRINSICS_DECL__ int __any_sync(unsigned mask, int pred) {
+  extern __device__ __device_builtin__ int __nvvm_vote_any_sync(unsigned int mask, int pred);
+  return __nvvm_vote_any_sync(mask, pred);
+}
+
+// Forwards mask and pred to the builtin __nvvm_vote_uni_sync and returns its result.
+__SM_30_INTRINSICS_DECL__ int __uni_sync(unsigned mask, int pred) {
+  extern __device__ __device_builtin__ int __nvvm_vote_uni_sync(unsigned int mask, int pred);
+  return __nvvm_vote_uni_sync(mask, pred);
+}
+
+// Forwards mask and pred to the builtin __nvvm_vote_ballot_sync and returns its result.
+__SM_30_INTRINSICS_DECL__ unsigned __ballot_sync(unsigned mask, int pred) {
+  extern __device__ __device_builtin__ unsigned int __nvvm_vote_ballot_sync(unsigned int mask, int pred);
+  return __nvvm_vote_ballot_sync(mask, pred);
+}
+
+// Issues the PTX instruction activemask.b32 and returns the 32-bit register it writes.
+__SM_30_INTRINSICS_DECL__ unsigned __activemask() {
+    unsigned mask;
+    asm volatile ("activemask.b32 %0;" : "=r"(mask));
+    return mask;
+}
+
+// These are removed starting with compute_70 and onwards
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+
+__SM_30_INTRINSICS_DECL__ int __shfl(int var, int srcLane, int width) {
+	const int ctrl = ((warpSize - width) << 8) | 0x1f;  // control word, passed as asm operand %3
+	int result;
+	asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(var), "r"(srcLane), "r"(ctrl));
+	return result;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl(unsigned int var, int srcLane, int width) {
+	return static_cast<unsigned int>(__shfl(static_cast<int>(var), srcLane, width));  // reuses the int overload
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_up(int var, unsigned int delta, int width) {
+	const int ctrl = (warpSize - width) << 8;  // control word, passed as asm operand %3 (no 0x1f bits here)
+	int result;
+	asm volatile ("shfl.up.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(var), "r"(delta), "r"(ctrl));
+	return result;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_up(unsigned int var, unsigned int delta, int width) {
+	return static_cast<unsigned int>(__shfl_up(static_cast<int>(var), delta, width)); // delegates to the int overload
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_down(int var, unsigned int delta, int width) {
+	int packed = ((warpSize - width) << 8) | 0x1f; // (warpSize - width) from bit 8 up, 0x1f in the low bits
+	int shuffled; // written by the asm output operand
+	asm volatile ("shfl.down.b32 %0, %1, %2, %3;" : "=r"(shuffled) : "r"(var), "r"(delta), "r"(packed));
+	return shuffled;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_down(unsigned int var, unsigned int delta, int width) {
+	return static_cast<unsigned int>(__shfl_down(static_cast<int>(var), delta, width)); // delegates to the int overload
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_xor(int var, int laneMask, int width) {
+	int packed = ((warpSize - width) << 8) | 0x1f; // (warpSize - width) from bit 8 up, 0x1f in the low bits
+	int shuffled; // written by the asm output operand
+	asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(shuffled) : "r"(var), "r"(laneMask), "r"(packed));
+	return shuffled;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor(unsigned int var, int laneMask, int width) {
+	return static_cast<unsigned int>(__shfl_xor(static_cast<int>(var), laneMask, width)); // delegates to the int overload
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl(float var, int srcLane, int width) {
+	// Same shfl.idx.b32 instruction as the int overload, but the value operands use "f" constraints.
+	int packed = ((warpSize - width) << 8) | 0x1f;
+	float shuffled;
+	asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=f"(shuffled) : "f"(var), "r"(srcLane), "r"(packed));
+	return shuffled;
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_up(float var, unsigned int delta, int width) {
+	// Same shfl.up.b32 instruction as the int overload, but the value operands use "f" constraints.
+	int packed = (warpSize - width) << 8;
+	float shuffled;
+	asm volatile ("shfl.up.b32 %0, %1, %2, %3;" : "=f"(shuffled) : "f"(var), "r"(delta), "r"(packed));
+	return shuffled;
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_down(float var, unsigned int delta, int width) {
+	// Same shfl.down.b32 instruction as the int overload, but the value operands use "f" constraints.
+	int packed = ((warpSize - width) << 8) | 0x1f;
+	float shuffled;
+	asm volatile ("shfl.down.b32 %0, %1, %2, %3;" : "=f"(shuffled) : "f"(var), "r"(delta), "r"(packed));
+	return shuffled;
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_xor(float var, int laneMask, int width) {
+	// Same shfl.bfly.b32 instruction as the int overload, but the value operands use "f" constraints.
+	int packed = ((warpSize - width) << 8) | 0x1f;
+	float shuffled;
+	asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(shuffled) : "f"(var), "r"(laneMask), "r"(packed));
+	return shuffled;
+}
+
+// 64-bit SHFL (each value is split into two 32-bit halves that are shuffled separately)
+
+__SM_30_INTRINSICS_DECL__ long long __shfl(long long var, int srcLane, int width) {
+	int lowWord, highWord; // the two 32-bit pieces of var
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "l"(var)); // unpack var
+	highWord = __shfl(highWord, srcLane, width); // each piece goes through the int overload
+	lowWord = __shfl(lowWord, srcLane, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lowWord), "r"(highWord)); // repack into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl(unsigned long long var, int srcLane, int width) {
+	return static_cast<unsigned long long>(__shfl(static_cast<long long>(var), srcLane, width)); // delegates to the long long overload
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_up(long long var, unsigned int delta, int width) {
+	int lowWord, highWord; // the two 32-bit pieces of var
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "l"(var)); // unpack var
+	highWord = __shfl_up(highWord, delta, width); // each piece goes through the int overload
+	lowWord = __shfl_up(lowWord, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lowWord), "r"(highWord)); // repack into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up(unsigned long long var, unsigned int delta, int width) {
+	return static_cast<unsigned long long>(__shfl_up(static_cast<long long>(var), delta, width)); // delegates to the long long overload
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_down(long long var, unsigned int delta, int width) {
+	int lowWord, highWord; // the two 32-bit pieces of var
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "l"(var)); // unpack var
+	highWord = __shfl_down(highWord, delta, width); // each piece goes through the int overload
+	lowWord = __shfl_down(lowWord, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lowWord), "r"(highWord)); // repack into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down(unsigned long long var, unsigned int delta, int width) {
+	return static_cast<unsigned long long>(__shfl_down(static_cast<long long>(var), delta, width)); // delegates to the long long overload
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_xor(long long var, int laneMask, int width) {
+	int lowWord, highWord; // the two 32-bit pieces of var
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "l"(var)); // unpack var
+	highWord = __shfl_xor(highWord, laneMask, width); // each piece goes through the int overload
+	lowWord = __shfl_xor(lowWord, laneMask, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lowWord), "r"(highWord)); // repack into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor(unsigned long long var, int laneMask, int width) {
+	return static_cast<unsigned long long>(__shfl_xor(static_cast<long long>(var), laneMask, width)); // delegates to the long long overload
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl(double var, int srcLane, int width) {
+	unsigned lowWord, highWord; // the two 32-bit pieces of var
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "d"(var)); // unpack var
+	highWord = __shfl(highWord, srcLane, width); // each piece goes through the unsigned int overload
+	lowWord = __shfl(lowWord, srcLane, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lowWord), "r"(highWord)); // repack into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_up(double var, unsigned int delta, int width) {
+	unsigned lowWord, highWord; // the two 32-bit pieces of var
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "d"(var)); // unpack var
+	highWord = __shfl_up(highWord, delta, width); // each piece goes through the unsigned int overload
+	lowWord = __shfl_up(lowWord, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lowWord), "r"(highWord)); // repack into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_down(double var, unsigned int delta, int width) {
+	unsigned lowWord, highWord; // the two 32-bit pieces of var
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "d"(var)); // unpack var
+	highWord = __shfl_down(highWord, delta, width); // each piece goes through the unsigned int overload
+	lowWord = __shfl_down(lowWord, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lowWord), "r"(highWord)); // repack into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_xor(double var, int laneMask, int width) {
+	unsigned lowWord, highWord; // the two 32-bit pieces of var
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "d"(var)); // unpack var
+	highWord = __shfl_xor(highWord, laneMask, width); // each piece goes through the unsigned int overload
+	lowWord = __shfl_xor(lowWord, laneMask, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lowWord), "r"(highWord)); // repack into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl(long var, int srcLane, int width) {
+	// long has no shuffle of its own: dispatch on its size to the long long overload, otherwise to the int one.
+	if (sizeof(long) == sizeof(long long)) return static_cast<long>(__shfl(static_cast<long long>(var), srcLane, width));
+	return static_cast<long>(__shfl(static_cast<int>(var), srcLane, width));
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl(unsigned long var, int srcLane, int width) {
+	// Dispatch on the size of long to the unsigned long long overload, otherwise to the unsigned int one.
+	if (sizeof(long) == sizeof(long long)) return static_cast<unsigned long>(__shfl(static_cast<unsigned long long>(var), srcLane, width));
+	return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), srcLane, width));
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_up(long var, unsigned int delta, int width) {
+	// long has no shuffle of its own: dispatch on its size to the long long overload, otherwise to the int one.
+	if (sizeof(long) == sizeof(long long)) return static_cast<long>(__shfl_up(static_cast<long long>(var), delta, width));
+	return static_cast<long>(__shfl_up(static_cast<int>(var), delta, width));
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_up(unsigned long var, unsigned int delta, int width) {
+	// Dispatch on the size of long to the unsigned long long overload, otherwise to the unsigned int one.
+	if (sizeof(long) == sizeof(long long)) return static_cast<unsigned long>(__shfl_up(static_cast<unsigned long long>(var), delta, width));
+	return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), delta, width));
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_down(long var, unsigned int delta, int width) {
+	// long has no shuffle of its own: dispatch on its size to the long long overload, otherwise to the int one.
+	if (sizeof(long) == sizeof(long long)) return static_cast<long>(__shfl_down(static_cast<long long>(var), delta, width));
+	return static_cast<long>(__shfl_down(static_cast<int>(var), delta, width));
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_down(unsigned long var, unsigned int delta, int width) {
+	// Dispatch on the size of long to the unsigned long long overload, otherwise to the unsigned int one.
+	if (sizeof(long) == sizeof(long long)) return static_cast<unsigned long>(__shfl_down(static_cast<unsigned long long>(var), delta, width));
+	return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), delta, width));
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_xor(long var, int laneMask, int width) {
+	// long has no shuffle of its own: dispatch on its size to the long long overload, otherwise to the int one.
+	if (sizeof(long) == sizeof(long long)) return static_cast<long>(__shfl_xor(static_cast<long long>(var), laneMask, width));
+	return static_cast<long>(__shfl_xor(static_cast<int>(var), laneMask, width));
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor(unsigned long var, int laneMask, int width) {
+	// Dispatch on the size of long to the unsigned long long overload, otherwise to the unsigned int one.
+	if (sizeof(long) == sizeof(long long)) return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned long long>(var), laneMask, width));
+	return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), laneMask, width));
+}
+
+#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
+
+// Warp register exchange (shuffle) intrinsics.
+// Notes:
+// a) Warp size is hardcoded to 32 here, because the compiler does not know
+//    the "warpSize" constant at this time
+// b) we cannot map the float __shfl to the int __shfl because it'll mess with
+//    the register number (especially if you're doing two shfls to move a double).
+__SM_30_INTRINSICS_DECL__ int __shfl_sync(unsigned mask, int var, int srcLane, int width) {
+	extern __device__ __device_builtin__ unsigned __nvvm_shfl_idx_sync(unsigned mask, unsigned a, unsigned b, unsigned c); // builtin declared at block scope
+	const int packed = ((warpSize - width) << 8) | 0x1f;
+	// The int arguments convert implicitly to the builtin's unsigned parameters; its unsigned result converts back to int.
+	const int shuffled = __nvvm_shfl_idx_sync(mask, var, srcLane, packed);
+	return shuffled;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_sync(unsigned mask, unsigned int var, int srcLane, int width) {
+	return static_cast<unsigned int>(__shfl_sync(mask, static_cast<int>(var), srcLane, width)); // delegates to the int overload
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_up_sync(unsigned mask, int var, unsigned int delta, int width) {
+	extern __device__ __device_builtin__ unsigned __nvvm_shfl_up_sync(unsigned mask, unsigned a, unsigned b, unsigned c); // builtin declared at block scope
+	const int packed = (warpSize - width) << 8; // the up form leaves the low bits zero (no 0x1f OR'ed in)
+	// var and packed convert implicitly to the builtin's unsigned parameters; its unsigned result converts back to int.
+	const int shuffled = __nvvm_shfl_up_sync(mask, var, delta, packed);
+	return shuffled;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_up_sync(unsigned mask, unsigned int var, unsigned int delta, int width) {
+	return static_cast<unsigned int>(__shfl_up_sync(mask, static_cast<int>(var), delta, width)); // delegates to the int overload
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_down_sync(unsigned mask, int var, unsigned int delta, int width) {
+	extern __device__ __device_builtin__ unsigned __nvvm_shfl_down_sync(unsigned mask, unsigned a, unsigned b, unsigned c); // builtin declared at block scope
+	const int packed = ((warpSize - width) << 8) | 0x1f;
+	// var and packed convert implicitly to the builtin's unsigned parameters; its unsigned result converts back to int.
+	const int shuffled = __nvvm_shfl_down_sync(mask, var, delta, packed);
+	return shuffled;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_down_sync(unsigned mask, unsigned int var, unsigned int delta, int width) {
+	return static_cast<unsigned int>(__shfl_down_sync(mask, static_cast<int>(var), delta, width)); // delegates to the int overload
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_xor_sync(unsigned mask, int var, int laneMask, int width) {
+	extern __device__ __device_builtin__ unsigned __nvvm_shfl_bfly_sync(unsigned mask, unsigned a, unsigned b, unsigned c); // builtin declared at block scope
+	const int packed = ((warpSize - width) << 8) | 0x1f;
+	// The int arguments convert implicitly to the builtin's unsigned parameters; its unsigned result converts back to int.
+	const int shuffled = __nvvm_shfl_bfly_sync(mask, var, laneMask, packed);
+	return shuffled;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor_sync(unsigned mask, unsigned int var, int laneMask, int width) {
+	return static_cast<unsigned int>(__shfl_xor_sync(mask, static_cast<int>(var), laneMask, width)); // delegates to the int overload
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_sync(unsigned mask, float var, int srcLane, int width) {
+	extern __device__ __device_builtin__ unsigned __nvvm_shfl_idx_sync(unsigned mask, unsigned a, unsigned b, unsigned c); // builtin declared at block scope
+	const int packed = ((warpSize - width) << 8) | 0x1f;
+	// The float is handed to the builtin via __float_as_int instead of going through the int wrapper,
+	// and the builtin's result is turned back into a float with __int_as_float.
+	const int rawBits = __nvvm_shfl_idx_sync(mask, __float_as_int(var), srcLane, packed);
+	return __int_as_float(rawBits);
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_up_sync(unsigned mask, float var, unsigned int delta, int width) {
+	extern __device__ __device_builtin__ unsigned __nvvm_shfl_up_sync(unsigned mask, unsigned a, unsigned b, unsigned c); // builtin declared at block scope
+	const int packed = (warpSize - width) << 8; // the up form leaves the low bits zero (no 0x1f OR'ed in)
+	// The float is handed to the builtin via __float_as_int instead of going through the int wrapper,
+	// and the builtin's result is turned back into a float with __int_as_float.
+	const int rawBits = __nvvm_shfl_up_sync(mask, __float_as_int(var), delta, packed);
+	return __int_as_float(rawBits);
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_down_sync(unsigned mask, float var, unsigned int delta, int width) {
+	extern __device__ __device_builtin__ unsigned __nvvm_shfl_down_sync(unsigned mask, unsigned a, unsigned b, unsigned c); // builtin declared at block scope
+	const int packed = ((warpSize - width) << 8) | 0x1f;
+	// The float is handed to the builtin via __float_as_int instead of going through the int wrapper,
+	// and the builtin's result is turned back into a float with __int_as_float.
+	const int rawBits = __nvvm_shfl_down_sync(mask, __float_as_int(var), delta, packed);
+	return __int_as_float(rawBits);
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_xor_sync(unsigned mask, float var, int laneMask, int width) {
+	extern __device__ __device_builtin__ unsigned __nvvm_shfl_bfly_sync(unsigned mask, unsigned a, unsigned b, unsigned c); // builtin declared at block scope
+	const int packed = ((warpSize - width) << 8) | 0x1f;
+	// The float is handed to the builtin via __float_as_int instead of going through the int wrapper,
+	// and the builtin's result is turned back into a float with __int_as_float.
+	const int rawBits = __nvvm_shfl_bfly_sync(mask, __float_as_int(var), laneMask, packed);
+	return __int_as_float(rawBits);
+}
+
+// 64-bit SHFL (each value is split into two 32-bit halves that are shuffled separately)
+__SM_30_INTRINSICS_DECL__ long long __shfl_sync(unsigned mask, long long var, int srcLane, int width) {
+	int lowWord, highWord; // the two 32-bit pieces of var
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "l"(var)); // unpack var
+	highWord = __shfl_sync(mask, highWord, srcLane, width); // each piece goes through the int overload with the same mask
+	lowWord = __shfl_sync(mask, lowWord, srcLane, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lowWord), "r"(highWord)); // repack into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_sync(unsigned mask, unsigned long long var, int srcLane, int width) {
+	return static_cast<unsigned long long>(__shfl_sync(mask, static_cast<long long>(var), srcLane, width)); // delegates to the long long overload
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_up_sync(unsigned mask, long long var, unsigned int delta, int width) {
+	int lowWord, highWord; // the two 32-bit pieces of var
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "l"(var)); // unpack var
+	highWord = __shfl_up_sync(mask, highWord, delta, width); // each piece goes through the int overload with the same mask
+	lowWord = __shfl_up_sync(mask, lowWord, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lowWord), "r"(highWord)); // repack into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up_sync(unsigned mask, unsigned long long var, unsigned int delta, int width) {
+	return static_cast<unsigned long long>(__shfl_up_sync(mask, static_cast<long long>(var), delta, width)); // delegates to the long long overload
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_down_sync(unsigned mask, long long var, unsigned int delta, int width) {
+	int lowWord, highWord; // the two 32-bit pieces of var
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "l"(var)); // unpack var
+	highWord = __shfl_down_sync(mask, highWord, delta, width); // each piece goes through the int overload with the same mask
+	lowWord = __shfl_down_sync(mask, lowWord, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lowWord), "r"(highWord)); // repack into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down_sync(unsigned mask, unsigned long long var, unsigned int delta, int width) {
+	return static_cast<unsigned long long>(__shfl_down_sync(mask, static_cast<long long>(var), delta, width)); // delegates to the long long overload
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_xor_sync(unsigned mask, long long var, int laneMask, int width) {
+	int lowWord, highWord; // the two 32-bit pieces of var
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "l"(var)); // unpack var
+	highWord = __shfl_xor_sync(mask, highWord, laneMask, width); // each piece goes through the int overload with the same mask
+	lowWord = __shfl_xor_sync(mask, lowWord, laneMask, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lowWord), "r"(highWord)); // repack into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor_sync(unsigned mask, unsigned long long var, int laneMask, int width) {
+	return static_cast<unsigned long long>(__shfl_xor_sync(mask, static_cast<long long>(var), laneMask, width)); // delegates to the long long overload
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_sync(unsigned mask, double var, int srcLane, int width) {
+	unsigned lowWord, highWord; // the two 32-bit pieces of var
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "d"(var)); // unpack var
+	highWord = __shfl_sync(mask, highWord, srcLane, width); // each piece goes through the unsigned int overload with the same mask
+	lowWord = __shfl_sync(mask, lowWord, srcLane, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lowWord), "r"(highWord)); // repack into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_up_sync(unsigned mask, double var, unsigned int delta, int width) {
+	unsigned lowWord, highWord; // the two 32-bit pieces of var
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "d"(var)); // unpack var
+	highWord = __shfl_up_sync(mask, highWord, delta, width); // each piece goes through the unsigned int overload with the same mask
+	lowWord = __shfl_up_sync(mask, lowWord, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lowWord), "r"(highWord)); // repack into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_down_sync(unsigned mask, double var, unsigned int delta, int width) {
+	unsigned lowWord, highWord; // the two 32-bit pieces of var
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "d"(var)); // unpack var
+	highWord = __shfl_down_sync(mask, highWord, delta, width); // each piece goes through the unsigned int overload with the same mask
+	lowWord = __shfl_down_sync(mask, lowWord, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lowWord), "r"(highWord)); // repack into var
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_xor_sync(unsigned mask, double var, int laneMask, int width) {
+	unsigned lowWord, highWord; // the two 32-bit pieces of var
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "d"(var)); // unpack var
+	highWord = __shfl_xor_sync(mask, highWord, laneMask, width); // each piece goes through the unsigned int overload with the same mask
+	lowWord = __shfl_xor_sync(mask, lowWord, laneMask, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lowWord), "r"(highWord)); // repack into var
+	return var;
+}
+
+// long needs some help to choose between the 32-bit and 64-bit overloads
+
+__SM_30_INTRINSICS_DECL__ long __shfl_sync(unsigned mask, long var, int srcLane, int width) {
+	// Dispatch on the size of long: the long long overload when the sizes match, otherwise the int overload.
+	if (sizeof(long) == sizeof(long long)) return static_cast<long>(__shfl_sync(mask, static_cast<long long>(var), srcLane, width));
+	return static_cast<long>(__shfl_sync(mask, static_cast<int>(var), srcLane, width));
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_sync(unsigned mask, unsigned long var, int srcLane, int width) {
+	// Dispatch on the size of long: the unsigned long long overload when the sizes match, otherwise the unsigned int one.
+	if (sizeof(long) == sizeof(long long)) return static_cast<unsigned long>(__shfl_sync(mask, static_cast<unsigned long long>(var), srcLane, width));
+	return static_cast<unsigned long>(__shfl_sync(mask, static_cast<unsigned int>(var), srcLane, width));
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_up_sync(unsigned mask, long var, unsigned int delta, int width) {
+	// Dispatch on the size of long: the long long overload when the sizes match, otherwise the int overload.
+	if (sizeof(long) == sizeof(long long)) return static_cast<long>(__shfl_up_sync(mask, static_cast<long long>(var), delta, width));
+	return static_cast<long>(__shfl_up_sync(mask, static_cast<int>(var), delta, width));
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_up_sync(unsigned mask, unsigned long var, unsigned int delta, int width) {
+	// Dispatch on the size of long: the unsigned long long overload when the sizes match, otherwise the unsigned int one.
+	if (sizeof(long) == sizeof(long long)) return static_cast<unsigned long>(__shfl_up_sync(mask, static_cast<unsigned long long>(var), delta, width));
+	return static_cast<unsigned long>(__shfl_up_sync(mask, static_cast<unsigned int>(var), delta, width));
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_down_sync(unsigned mask, long var, unsigned int delta, int width) {
+	// Dispatch on the size of long: the long long overload when the sizes match, otherwise the int overload.
+	if (sizeof(long) == sizeof(long long)) return static_cast<long>(__shfl_down_sync(mask, static_cast<long long>(var), delta, width));
+	return static_cast<long>(__shfl_down_sync(mask, static_cast<int>(var), delta, width));
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_down_sync(unsigned mask, unsigned long var, unsigned int delta, int width) {
+	// Dispatch on the size of long: the unsigned long long overload when the sizes match, otherwise the unsigned int one.
+	if (sizeof(long) == sizeof(long long)) return static_cast<unsigned long>(__shfl_down_sync(mask, static_cast<unsigned long long>(var), delta, width));
+	return static_cast<unsigned long>(__shfl_down_sync(mask, static_cast<unsigned int>(var), delta, width));
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_xor_sync(unsigned mask, long var, int laneMask, int width) {
+	// Dispatch on the size of long: the long long overload when the sizes match, otherwise the int overload.
+	if (sizeof(long) == sizeof(long long)) return static_cast<long>(__shfl_xor_sync(mask, static_cast<long long>(var), laneMask, width));
+	return static_cast<long>(__shfl_xor_sync(mask, static_cast<int>(var), laneMask, width));
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor_sync(unsigned mask, unsigned long var, int laneMask, int width) {
+	// Dispatch on the size of long: the unsigned long long overload when the sizes match, otherwise the unsigned int one.
+	if (sizeof(long) == sizeof(long long)) return static_cast<unsigned long>(__shfl_xor_sync(mask, static_cast<unsigned long long>(var), laneMask, width));
+	return static_cast<unsigned long>(__shfl_xor_sync(mask, static_cast<unsigned int>(var), laneMask, width));
+}
+
+#if defined(__local_warpSize) /* marker macro; where it is defined is not visible in this part of the file */
+#undef warpSize /* drop the macro form of warpSize used by the shuffle wrappers above */
+#undef __local_warpSize
+#endif /* __local_warpSize */
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 300 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_30_INTRINSICS_DECL__
+
+#endif /* !__SM_30_INTRINSICS_HPP__ */
+
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_32_atomic_functions.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_32_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..d2070bc8bbfc0c5aa58c45ef1d28623d91f4e938
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_32_atomic_functions.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 35.235 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.35.235 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if !defined(__SM_32_ATOMIC_FUNCTIONS_H__)
+#define __SM_32_ATOMIC_FUNCTIONS_H__
+
+#if defined(__CUDACC_RTC__) /* choose the declaration specifiers prefixed to every atomic prototype below */
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ __device__
+#elif defined(_NVHPC_CUDA)
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ extern __device__ __cudart_builtin__
+#else /* !__CUDACC_RTC__ && !_NVHPC_CUDA */
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ / _NVHPC_CUDA selection */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA) /* neither macro defined: each prototype below gets an empty body */
+#define __DEF_IF_HOST { }
+#else  /* __CUDA_ARCH__ || _NVHPC_CUDA: each prototype below stays a plain declaration */
+#define __DEF_IF_HOST ;
+#endif /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMin(long long *address, long long val) __DEF_IF_HOST /* long long overloads; __DEF_IF_HOST expands to either { } or ; */
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMax(long long *address, long long val) __DEF_IF_HOST /* long long overload */
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicAnd(long long *address, long long val) __DEF_IF_HOST /* long long overload */
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicOr(long long *address, long long val) __DEF_IF_HOST /* long long overload */
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicXor(long long *address, long long val) __DEF_IF_HOST /* long long overload */
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMin(unsigned long long *address, unsigned long long val) __DEF_IF_HOST /* unsigned long long overloads of the same five names */
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMax(unsigned long long *address, unsigned long long val) __DEF_IF_HOST /* unsigned long long overload */
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicAnd(unsigned long long *address, unsigned long long val) __DEF_IF_HOST /* unsigned long long overload */
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicOr(unsigned long long *address, unsigned long long val) __DEF_IF_HOST /* unsigned long long overload */
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicXor(unsigned long long *address, unsigned long long val) __DEF_IF_HOST /* unsigned long long overload */
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_32_ATOMIC_FUNCTIONS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_32_atomic_functions.hpp"
+#endif /* !__CUDACC_RTC__  && defined(__CUDA_ARCH__) */
+
+#endif /* !__SM_32_ATOMIC_FUNCTIONS_H__ */
+
+#undef EXCLUDE_FROM_RTC
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_32_intrinsics.hpp b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_32_intrinsics.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d50f9cea5c4d89bc555855a8ca73d617bcfa461a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_32_intrinsics.hpp
@@ -0,0 +1,588 @@
+/*
+ * Copyright 1993-2020 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_32_INTRINSICS_HPP__)
+#define __SM_32_INTRINSICS_HPP__
+
+#if defined(__CUDACC_RTC__) /* __SM_32_INTRINSICS_DECL__ is the specifier prefixed to each intrinsic defined below */
+#define __SM_32_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__: additionally static and inline */
+#define __SM_32_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+// In here are intrinsics which are built in to the compiler. These may be
+// referenced by intrinsic implementations from this file.
+extern "C"
+{
+    // There are no intrinsics built in to the compiler for SM-3.5,
+    // all intrinsics are now implemented as inline PTX below.
+}
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-3.5 intrinsics which are included as        *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+// LDG is a "load from global via texture path" command which can exhibit higher
+// bandwidth on GK110 than a regular LD.
+// __LDG_PTR is the inline-asm constraint string for the address operand of the loads below: "l" for Win64, LP64 or __CUDACC_RTC__ builds, "r" otherwise.
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __LDG_PTR   "l"
+#else // none of the conditions above hold
+#define __LDG_PTR   "r"
+#endif // __LDG_PTR selection
+
+/******************************************************************************
+ *  __ldg: per-type overloads, each a single inline-PTX ld.global.nc load     *
+ ******************************************************************************/
+
+// long / unsigned long: the 64-bit PTX load is used when __LP64__ is defined, the 32-bit load otherwise.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+#endif // __LP64__
+
+// Signed integer scalars and vectors. 8-bit values are loaded into int-sized temporaries ("=r") and narrowed with a cast.
+__SM_32_INTRINSICS_DECL__ char __ldg(const char *ptr) { unsigned int ret; asm volatile ("ld.global.nc.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldg(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.nc.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldg(const short *ptr) { unsigned short ret; asm volatile ("ld.global.nc.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldg(const int *ptr) { unsigned int ret; asm volatile ("ld.global.nc.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldg(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.nc.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldg(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.nc.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldg(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.nc.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldg(const short2 *ptr) { short2 ret; asm volatile ("ld.global.nc.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldg(const short4 *ptr) { short4 ret; asm volatile ("ld.global.nc.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldg(const int2 *ptr) { int2 ret; asm volatile ("ld.global.nc.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldg(const int4 *ptr) { int4 ret; asm volatile ("ld.global.nc.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldg(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.nc.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+// Unsigned integer scalars and vectors: same pattern with the .u8/.u16/.u32/.u64 load types.
+__SM_32_INTRINSICS_DECL__ unsigned char __ldg(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.nc.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr));  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldg(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.nc.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldg(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.nc.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldg(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.nc.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldg(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.nc.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldg(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.nc.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldg(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.nc.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldg(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.nc.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldg(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.nc.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldg(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldg(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.nc.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+// Floating-point scalars and vectors: "=f" output operands for float, "=d" for double.
+__SM_32_INTRINSICS_DECL__ float __ldg(const float *ptr) { float ret; asm volatile ("ld.global.nc.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldg(const double *ptr) { double ret; asm volatile ("ld.global.nc.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldg(const float2 *ptr) { float2 ret; asm volatile ("ld.global.nc.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldg(const float4 *ptr) { float4 ret; asm volatile ("ld.global.nc.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldg(const double2 *ptr) { double2 ret; asm volatile ("ld.global.nc.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; }
+
+
+/******************************************************************************
+ *  __ldcg: per-type overloads, each a single inline-PTX ld.global.cg load    *
+ ******************************************************************************/
+// NOTE(review): per the PTX ISA, .cg is the "cache at global level" operator (L2, not L1) - confirm for the targeted PTX version.
+// long / unsigned long: the 64-bit PTX load is used when __LP64__ is defined, the 32-bit load otherwise.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldcg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldcg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+#endif // __LP64__
+
+// Signed integer scalars and vectors. 8-bit values are loaded into int-sized temporaries ("=r") and narrowed with a cast.
+__SM_32_INTRINSICS_DECL__ char __ldcg(const char *ptr) { unsigned int ret; asm volatile ("ld.global.cg.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldcg(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.cg.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldcg(const short *ptr) { unsigned short ret; asm volatile ("ld.global.cg.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldcg(const int *ptr) { unsigned int ret; asm volatile ("ld.global.cg.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldcg(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cg.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldcg(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.cg.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldcg(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.cg.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldcg(const short2 *ptr) { short2 ret; asm volatile ("ld.global.cg.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldcg(const short4 *ptr) { short4 ret; asm volatile ("ld.global.cg.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldcg(const int2 *ptr) { int2 ret; asm volatile ("ld.global.cg.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldcg(const int4 *ptr) { int4 ret; asm volatile ("ld.global.cg.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcg(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.cg.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+// Unsigned integer scalars and vectors: same pattern with the .u8/.u16/.u32/.u64 load types.
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcg(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.cg.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr));  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcg(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.cg.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcg(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.cg.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcg(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cg.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcg(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.cg.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcg(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.cg.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcg(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.cg.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcg(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.cg.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldcg(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.cg.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldcg(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.cg.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcg(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.cg.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+// Floating-point scalars and vectors: "=f" output operands for float, "=d" for double.
+__SM_32_INTRINSICS_DECL__ float __ldcg(const float *ptr) { float ret; asm volatile ("ld.global.cg.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldcg(const double *ptr) { double ret; asm volatile ("ld.global.cg.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldcg(const float2 *ptr) { float2 ret; asm volatile ("ld.global.cg.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldcg(const float4 *ptr) { float4 ret; asm volatile ("ld.global.cg.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldcg(const double2 *ptr) { double2 ret; asm volatile ("ld.global.cg.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; }
+
+/******************************************************************************
+ *  __ldca: per-type overloads, each a single inline-PTX ld.global.ca load    *
+ ******************************************************************************/
+// NOTE(review): per the PTX ISA, .ca is the "cache at all levels" operator - confirm for the targeted PTX version.
+// long / unsigned long: the 64-bit PTX load is used when __LP64__ is defined, the 32-bit load otherwise.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldca(const long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldca(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldca(const long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldca(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+#endif // __LP64__
+
+// Signed integer scalars and vectors. 8-bit values are loaded into int-sized temporaries ("=r") and narrowed with a cast.
+__SM_32_INTRINSICS_DECL__ char __ldca(const char *ptr) { unsigned int ret; asm volatile ("ld.global.ca.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldca(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.ca.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldca(const short *ptr) { unsigned short ret; asm volatile ("ld.global.ca.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldca(const int *ptr) { unsigned int ret; asm volatile ("ld.global.ca.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldca(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.ca.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldca(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.ca.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldca(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.ca.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldca(const short2 *ptr) { short2 ret; asm volatile ("ld.global.ca.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldca(const short4 *ptr) { short4 ret; asm volatile ("ld.global.ca.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldca(const int2 *ptr) { int2 ret; asm volatile ("ld.global.ca.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldca(const int4 *ptr) { int4 ret; asm volatile ("ld.global.ca.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldca(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.ca.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+// Unsigned integer scalars and vectors: same pattern with the .u8/.u16/.u32/.u64 load types.
+__SM_32_INTRINSICS_DECL__ unsigned char __ldca(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.ca.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr));  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldca(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.ca.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldca(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.ca.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldca(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.ca.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldca(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.ca.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldca(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.ca.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldca(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.ca.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldca(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.ca.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldca(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.ca.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldca(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.ca.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldca(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.ca.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+// Floating-point scalars and vectors: "=f" output operands for float, "=d" for double.
+__SM_32_INTRINSICS_DECL__ float __ldca(const float *ptr) { float ret; asm volatile ("ld.global.ca.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldca(const double *ptr) { double ret; asm volatile ("ld.global.ca.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldca(const float2 *ptr) { float2 ret; asm volatile ("ld.global.ca.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldca(const float4 *ptr) { float4 ret; asm volatile ("ld.global.ca.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldca(const double2 *ptr) { double2 ret; asm volatile ("ld.global.ca.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; }
+
+/******************************************************************************
+ *  __ldcs: per-type overloads, each a single inline-PTX ld.global.cs load    *
+ ******************************************************************************/
+// NOTE(review): per the PTX ISA, .cs is the "cache streaming" operator (data likely accessed once) - confirm for the targeted PTX version.
+// long / unsigned long: the 64-bit PTX load is used when __LP64__ is defined, the 32-bit load otherwise.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldcs(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcs(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldcs(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcs(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+#endif // __LP64__
+
+// Signed integer scalars and vectors. 8-bit values are loaded into int-sized temporaries ("=r") and narrowed with a cast.
+__SM_32_INTRINSICS_DECL__ char __ldcs(const char *ptr) { unsigned int ret; asm volatile ("ld.global.cs.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldcs(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.cs.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldcs(const short *ptr) { unsigned short ret; asm volatile ("ld.global.cs.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldcs(const int *ptr) { unsigned int ret; asm volatile ("ld.global.cs.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldcs(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cs.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldcs(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.cs.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldcs(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.cs.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldcs(const short2 *ptr) { short2 ret; asm volatile ("ld.global.cs.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldcs(const short4 *ptr) { short4 ret; asm volatile ("ld.global.cs.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldcs(const int2 *ptr) { int2 ret; asm volatile ("ld.global.cs.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldcs(const int4 *ptr) { int4 ret; asm volatile ("ld.global.cs.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcs(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.cs.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+// Unsigned integer scalars and vectors: same pattern with the .u8/.u16/.u32/.u64 load types.
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcs(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.cs.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr));  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcs(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.cs.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcs(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.cs.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcs(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cs.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcs(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.cs.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcs(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.cs.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcs(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.cs.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcs(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.cs.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldcs(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.cs.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldcs(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.cs.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcs(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.cs.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+// Floating-point scalars and vectors: "=f" output operands for float, "=d" for double.
+__SM_32_INTRINSICS_DECL__ float __ldcs(const float *ptr) { float ret; asm volatile ("ld.global.cs.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldcs(const double *ptr) { double ret; asm volatile ("ld.global.cs.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldcs(const float2 *ptr) { float2 ret; asm volatile ("ld.global.cs.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldcs(const float4 *ptr) { float4 ret; asm volatile ("ld.global.cs.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldcs(const double2 *ptr) { double2 ret; asm volatile ("ld.global.cs.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; }
+
+/******************************************************************************
+ *                                   __ldlu                                    *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific: when __LP64__ is defined the long overloads of __ldlu issue a 64-bit ld.global.lu ("=l" constraint), otherwise a 32-bit one ("=r"). The signed variant loads into an unsigned long temporary and casts the result back to long.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldlu(const long *ptr) { unsigned long ret; asm ("ld.global.lu.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldlu(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.lu.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldlu(const long *ptr) { unsigned long ret; asm ("ld.global.lu.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldlu(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.lu.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+#endif
+
+// Signed integer __ldlu overloads (ld.global.lu.s*), each with a "memory" clobber. 8-bit elements are loaded into 32-bit "=r" registers and narrowed with a cast; scalar results go through an unsigned temporary and are cast back to the signed type. NOTE(review): __LDG_PTR is defined outside this chunk -- presumably the pointer-operand constraint; verify.
+__SM_32_INTRINSICS_DECL__ char __ldlu(const char *ptr) { unsigned int ret; asm ("ld.global.lu.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldlu(const signed char *ptr) { unsigned int ret; asm ("ld.global.lu.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldlu(const short *ptr) { unsigned short ret; asm ("ld.global.lu.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldlu(const int *ptr) { unsigned int ret; asm ("ld.global.lu.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldlu(const long long *ptr) { unsigned long long ret; asm ("ld.global.lu.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldlu(const char2 *ptr) { char2 ret; int2 tmp; asm ("ld.global.lu.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldlu(const char4 *ptr) { char4 ret; int4 tmp; asm ("ld.global.lu.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldlu(const short2 *ptr) { short2 ret; asm ("ld.global.lu.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldlu(const short4 *ptr) { short4 ret; asm ("ld.global.lu.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldlu(const int2 *ptr) { int2 ret; asm ("ld.global.lu.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldlu(const int4 *ptr) { int4 ret; asm ("ld.global.lu.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldlu(const longlong2 *ptr) { longlong2 ret; asm ("ld.global.lu.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+// Unsigned integer __ldlu overloads (ld.global.lu.u*), each with a "memory" clobber. 8-bit elements go through 32-bit "=r" temporaries and are narrowed with a cast; 16/32/64-bit elements are written directly into the result via "=h"/"=r"/"=l".
+__SM_32_INTRINSICS_DECL__ unsigned char __ldlu(const unsigned char *ptr) { unsigned int ret; asm ("ld.global.lu.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory");  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldlu(const unsigned short *ptr) { unsigned short ret; asm ("ld.global.lu.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldlu(const unsigned int *ptr) { unsigned int ret; asm ("ld.global.lu.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldlu(const unsigned long long *ptr) { unsigned long long ret; asm ("ld.global.lu.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldlu(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm ("ld.global.lu.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldlu(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm ("ld.global.lu.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldlu(const ushort2 *ptr) { ushort2 ret; asm ("ld.global.lu.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldlu(const ushort4 *ptr) { ushort4 ret; asm ("ld.global.lu.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldlu(const uint2 *ptr) { uint2 ret; asm ("ld.global.lu.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldlu(const uint4 *ptr) { uint4 ret; asm ("ld.global.lu.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldlu(const ulonglong2 *ptr) { ulonglong2 ret; asm ("ld.global.lu.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+// Floating-point __ldlu overloads: ld.global.lu.{f32,f64} and their v2/v4 vector forms, loaded directly into the result through "=f" (float) and "=d" (double) constraints, each with a "memory" clobber.
+__SM_32_INTRINSICS_DECL__ float __ldlu(const float *ptr) { float ret; asm ("ld.global.lu.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldlu(const double *ptr) { double ret; asm ("ld.global.lu.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldlu(const float2 *ptr) { float2 ret; asm ("ld.global.lu.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldlu(const float4 *ptr) { float4 ret; asm ("ld.global.lu.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldlu(const double2 *ptr) { double2 ret; asm ("ld.global.lu.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+
+/******************************************************************************
+ *                                   __ldcv                                    *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific: when __LP64__ is defined the long overloads of __ldcv issue a 64-bit ld.global.cv ("=l" constraint), otherwise a 32-bit one ("=r"). The signed variant loads into an unsigned long temporary and casts the result back to long.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldcv(const long *ptr) { unsigned long ret; asm ("ld.global.cv.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcv(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.cv.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldcv(const long *ptr) { unsigned long ret; asm ("ld.global.cv.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcv(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.cv.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+#endif
+
+// Signed integer __ldcv overloads (ld.global.cv.s*), each with a "memory" clobber. 8-bit elements are loaded into 32-bit "=r" registers and narrowed with a cast; scalar results go through an unsigned temporary and are cast back to the signed type. NOTE(review): __LDG_PTR is defined outside this chunk -- presumably the pointer-operand constraint; verify.
+__SM_32_INTRINSICS_DECL__ char __ldcv(const char *ptr) { unsigned int ret; asm ("ld.global.cv.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldcv(const signed char *ptr) { unsigned int ret; asm ("ld.global.cv.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldcv(const short *ptr) { unsigned short ret; asm ("ld.global.cv.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldcv(const int *ptr) { unsigned int ret; asm ("ld.global.cv.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldcv(const long long *ptr) { unsigned long long ret; asm ("ld.global.cv.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldcv(const char2 *ptr) { char2 ret; int2 tmp; asm ("ld.global.cv.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldcv(const char4 *ptr) { char4 ret; int4 tmp; asm ("ld.global.cv.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldcv(const short2 *ptr) { short2 ret; asm ("ld.global.cv.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldcv(const short4 *ptr) { short4 ret; asm ("ld.global.cv.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldcv(const int2 *ptr) { int2 ret; asm ("ld.global.cv.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldcv(const int4 *ptr) { int4 ret; asm ("ld.global.cv.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcv(const longlong2 *ptr) { longlong2 ret; asm ("ld.global.cv.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+// Unsigned integer __ldcv overloads (ld.global.cv.u*), each with a "memory" clobber. 8-bit elements go through 32-bit "=r" temporaries and are narrowed with a cast; 16/32/64-bit elements are written directly into the result via "=h"/"=r"/"=l".
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcv(const unsigned char *ptr) { unsigned int ret; asm ("ld.global.cv.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory");  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcv(const unsigned short *ptr) { unsigned short ret; asm ("ld.global.cv.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcv(const unsigned int *ptr) { unsigned int ret; asm ("ld.global.cv.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcv(const unsigned long long *ptr) { unsigned long long ret; asm ("ld.global.cv.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcv(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm ("ld.global.cv.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcv(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm ("ld.global.cv.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcv(const ushort2 *ptr) { ushort2 ret; asm ("ld.global.cv.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcv(const ushort4 *ptr) { ushort4 ret; asm ("ld.global.cv.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldcv(const uint2 *ptr) { uint2 ret; asm ("ld.global.cv.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldcv(const uint4 *ptr) { uint4 ret; asm ("ld.global.cv.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcv(const ulonglong2 *ptr) { ulonglong2 ret; asm ("ld.global.cv.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+// Floating-point __ldcv overloads: ld.global.cv.{f32,f64} and their v2/v4 vector forms, loaded directly into the result through "=f" (float) and "=d" (double) constraints, each with a "memory" clobber.
+__SM_32_INTRINSICS_DECL__ float __ldcv(const float *ptr) { float ret; asm ("ld.global.cv.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldcv(const double *ptr) { double ret; asm ("ld.global.cv.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldcv(const float2 *ptr) { float2 ret; asm ("ld.global.cv.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldcv(const float4 *ptr) { float4 ret; asm ("ld.global.cv.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldcv(const double2 *ptr) { double2 ret; asm ("ld.global.cv.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+
+/******************************************************************************
+ *                                   __stwb                                    *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific: when __LP64__ is defined the long overloads of __stwb issue a 64-bit st.global.wb (value passed with the "l" constraint), otherwise a 32-bit one ("r").
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ void __stwb(long *ptr, long value) { asm ("st.global.wb.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long *ptr, unsigned long value) { asm ("st.global.wb.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ void __stwb(long *ptr, long value) { asm ("st.global.wb.s32 [%0], %1;"  :: __LDG_PTR (ptr),  "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long *ptr, unsigned long value) { asm ("st.global.wb.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+#endif
+
+// Signed integer __stwb overloads (st.global.wb.s*), each with a "memory" clobber. 8-bit values are widened to int and passed in "r" registers; 16/32/64-bit values are passed directly via "h"/"r"/"l". NOTE(review): __LDG_PTR is defined outside this chunk -- presumably the pointer-operand constraint; verify.
+__SM_32_INTRINSICS_DECL__ void __stwb(char *ptr, char value) { asm ("st.global.wb.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(signed char *ptr, signed char value) { asm ("st.global.wb.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(short *ptr, short value) { asm ("st.global.wb.s16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(int *ptr, int value) { asm ("st.global.wb.s32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(long long *ptr, long long value) { asm ("st.global.wb.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.wb.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wb.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(short2 *ptr, short2 value) { asm ("st.global.wb.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(short4 *ptr, short4 value) { asm ("st.global.wb.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(int2 *ptr, int2 value) { asm ("st.global.wb.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(int4 *ptr, int4 value) { asm ("st.global.wb.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(longlong2 *ptr, longlong2 value) { asm ("st.global.wb.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+// Unsigned integer __stwb overloads (st.global.wb.u*), each with a "memory" clobber. 8-bit values are converted to int and passed in "r" registers; 16/32/64-bit values are passed directly via "h"/"r"/"l".
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned char *ptr, unsigned char value) { asm ("st.global.wb.u8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory");  }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned short *ptr, unsigned short value) { asm ("st.global.wb.u16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned int *ptr, unsigned int value) { asm ("st.global.wb.u32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long long *ptr, unsigned long long value) { asm ("st.global.wb.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.wb.v2.u8 [%0], {%1,%2};"  :: __LDG_PTR (ptr),  "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wb.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(ushort2 *ptr, ushort2 value) { asm ("st.global.wb.v2.u16 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(ushort4 *ptr, ushort4 value) { asm ("st.global.wb.v4.u16 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(uint2 *ptr, uint2 value) { asm ("st.global.wb.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(uint4 *ptr, uint4 value) { asm ("st.global.wb.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.wb.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+// Floating-point __stwb overloads: st.global.wb.{f32,f64} and their v2/v4 vector forms, with values passed through "f" (float) and "d" (double) constraints and a "memory" clobber on each.
+__SM_32_INTRINSICS_DECL__ void __stwb(float *ptr, float value) { asm ("st.global.wb.f32 [%0], %1;"  :: __LDG_PTR (ptr), "f"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(double *ptr, double value) { asm ("st.global.wb.f64 [%0], %1;"  :: __LDG_PTR (ptr), "d"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(float2 *ptr, float2 value) { asm ("st.global.wb.v2.f32 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(float4 *ptr, float4 value) { asm ("st.global.wb.v4.f32 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(double2 *ptr, double2 value) { asm ("st.global.wb.v2.f64 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
+
+/******************************************************************************
+ *                                   __stcg                                    *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific: when __LP64__ is defined the long overloads of __stcg issue a 64-bit st.global.cg (value passed with the "l" constraint), otherwise a 32-bit one ("r").
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ void __stcg(long *ptr, long value) { asm ("st.global.cg.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long *ptr, unsigned long value) { asm ("st.global.cg.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ void __stcg(long *ptr, long value) { asm ("st.global.cg.s32 [%0], %1;"  :: __LDG_PTR (ptr),  "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long *ptr, unsigned long value) { asm ("st.global.cg.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+#endif
+
+// Signed integer __stcg overloads (st.global.cg.s*), each with a "memory" clobber. 8-bit values are widened to int and passed in "r" registers; 16/32/64-bit values are passed directly via "h"/"r"/"l". NOTE(review): __LDG_PTR is defined outside this chunk -- presumably the pointer-operand constraint; verify.
+__SM_32_INTRINSICS_DECL__ void __stcg(char *ptr, char value) { asm ("st.global.cg.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(signed char *ptr, signed char value) { asm ("st.global.cg.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(short *ptr, short value) { asm ("st.global.cg.s16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(int *ptr, int value) { asm ("st.global.cg.s32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(long long *ptr, long long value) { asm ("st.global.cg.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.cg.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cg.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(short2 *ptr, short2 value) { asm ("st.global.cg.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(short4 *ptr, short4 value) { asm ("st.global.cg.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(int2 *ptr, int2 value) { asm ("st.global.cg.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(int4 *ptr, int4 value) { asm ("st.global.cg.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(longlong2 *ptr, longlong2 value) { asm ("st.global.cg.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+// Unsigned integer __stcg overloads (st.global.cg.u*), each with a "memory" clobber. 8-bit values are converted to int and passed in "r" registers; 16/32/64-bit values are passed directly via "h"/"r"/"l".
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned char *ptr, unsigned char value) { asm ("st.global.cg.u8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory");  }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned short *ptr, unsigned short value) { asm ("st.global.cg.u16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned int *ptr, unsigned int value) { asm ("st.global.cg.u32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long long *ptr, unsigned long long value) { asm ("st.global.cg.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.cg.v2.u8 [%0], {%1,%2};"  :: __LDG_PTR (ptr),  "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cg.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(ushort2 *ptr, ushort2 value) { asm ("st.global.cg.v2.u16 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(ushort4 *ptr, ushort4 value) { asm ("st.global.cg.v4.u16 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(uint2 *ptr, uint2 value) { asm ("st.global.cg.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(uint4 *ptr, uint4 value) { asm ("st.global.cg.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.cg.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+// Floating-point __stcg overloads: st.global.cg.{f32,f64} and their v2/v4 vector forms, with values passed through "f" (float) and "d" (double) constraints and a "memory" clobber on each.
+__SM_32_INTRINSICS_DECL__ void __stcg(float *ptr, float value) { asm ("st.global.cg.f32 [%0], %1;"  :: __LDG_PTR (ptr), "f"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(double *ptr, double value) { asm ("st.global.cg.f64 [%0], %1;"  :: __LDG_PTR (ptr), "d"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(float2 *ptr, float2 value) { asm ("st.global.cg.v2.f32 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(float4 *ptr, float4 value) { asm ("st.global.cg.v4.f32 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(double2 *ptr, double2 value) { asm ("st.global.cg.v2.f64 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
+
+/******************************************************************************
+ *                                   __stcs                                    *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific: when __LP64__ is defined the long overloads of __stcs issue a 64-bit st.global.cs (value passed with the "l" constraint), otherwise a 32-bit one ("r").
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ void __stcs(long *ptr, long value) { asm ("st.global.cs.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long *ptr, unsigned long value) { asm ("st.global.cs.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ void __stcs(long *ptr, long value) { asm ("st.global.cs.s32 [%0], %1;"  :: __LDG_PTR (ptr),  "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long *ptr, unsigned long value) { asm ("st.global.cs.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+#endif
+
+// Signed integer __stcs overloads (st.global.cs.s*), each with a "memory" clobber. 8-bit values are widened to int and passed in "r" registers; 16/32/64-bit values are passed directly via "h"/"r"/"l". NOTE(review): __LDG_PTR is defined outside this chunk -- presumably the pointer-operand constraint; verify.
+__SM_32_INTRINSICS_DECL__ void __stcs(char *ptr, char value) { asm ("st.global.cs.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(signed char *ptr, signed char value) { asm ("st.global.cs.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(short *ptr, short value) { asm ("st.global.cs.s16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(int *ptr, int value) { asm ("st.global.cs.s32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(long long *ptr, long long value) { asm ("st.global.cs.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.cs.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cs.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(short2 *ptr, short2 value) { asm ("st.global.cs.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(short4 *ptr, short4 value) { asm ("st.global.cs.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(int2 *ptr, int2 value) { asm ("st.global.cs.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(int4 *ptr, int4 value) { asm ("st.global.cs.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(longlong2 *ptr, longlong2 value) { asm ("st.global.cs.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+// Unsigned integer __stcs overloads (st.global.cs.u*), each with a "memory" clobber. 8-bit values are converted to int and passed in "r" registers; 16/32/64-bit values are passed directly via "h"/"r"/"l".
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned char *ptr, unsigned char value) { asm ("st.global.cs.u8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory");  }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned short *ptr, unsigned short value) { asm ("st.global.cs.u16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned int *ptr, unsigned int value) { asm ("st.global.cs.u32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long long *ptr, unsigned long long value) { asm ("st.global.cs.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.cs.v2.u8 [%0], {%1,%2};"  :: __LDG_PTR (ptr),  "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cs.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(ushort2 *ptr, ushort2 value) { asm ("st.global.cs.v2.u16 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(ushort4 *ptr, ushort4 value) { asm ("st.global.cs.v4.u16 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(uint2 *ptr, uint2 value) { asm ("st.global.cs.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(uint4 *ptr, uint4 value) { asm ("st.global.cs.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.cs.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+// Floating-point __stcs overloads: st.global.cs.{f32,f64} and their v2/v4 vector forms, with values passed through "f" (float) and "d" (double) constraints and a "memory" clobber on each.
+__SM_32_INTRINSICS_DECL__ void __stcs(float *ptr, float value) { asm ("st.global.cs.f32 [%0], %1;"  :: __LDG_PTR (ptr), "f"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(double *ptr, double value) { asm ("st.global.cs.f64 [%0], %1;"  :: __LDG_PTR (ptr), "d"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(float2 *ptr, float2 value) { asm ("st.global.cs.v2.f32 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(float4 *ptr, float4 value) { asm ("st.global.cs.v4.f32 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(double2 *ptr, double2 value) { asm ("st.global.cs.v2.f64 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
+
+/******************************************************************************
+ *                                   __stwt                                    *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific: when __LP64__ is defined the long overloads of __stwt issue a 64-bit st.global.wt (value passed with the "l" constraint), otherwise a 32-bit one ("r").
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ void __stwt(long *ptr, long value) { asm ("st.global.wt.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long *ptr, unsigned long value) { asm ("st.global.wt.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ void __stwt(long *ptr, long value) { asm ("st.global.wt.s32 [%0], %1;"  :: __LDG_PTR (ptr),  "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long *ptr, unsigned long value) { asm ("st.global.wt.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+#endif
+
+// Signed integer __stwt overloads: 8-bit values are widened to int and bound with "r"; 16/32/64-bit values bind with "h"/"r"/"l". Vector types store all lanes in one st.global.wt.vN instruction.
+__SM_32_INTRINSICS_DECL__ void __stwt(char *ptr, char value) { asm ("st.global.wt.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(signed char *ptr, signed char value) { asm ("st.global.wt.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(short *ptr, short value) { asm ("st.global.wt.s16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(int *ptr, int value) { asm ("st.global.wt.s32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(long long *ptr, long long value) { asm ("st.global.wt.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.wt.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wt.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(short2 *ptr, short2 value) { asm ("st.global.wt.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(short4 *ptr, short4 value) { asm ("st.global.wt.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(int2 *ptr, int2 value) { asm ("st.global.wt.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(int4 *ptr, int4 value) { asm ("st.global.wt.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(longlong2 *ptr, longlong2 value) { asm ("st.global.wt.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+// Unsigned integer __stwt overloads (.u8/.u16/.u32/.u64): 8-bit values are widened to int and bound with "r"; wider values bind with "h"/"r"/"l".
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned char *ptr, unsigned char value) { asm ("st.global.wt.u8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory");  }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned short *ptr, unsigned short value) { asm ("st.global.wt.u16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned int *ptr, unsigned int value) { asm ("st.global.wt.u32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long long *ptr, unsigned long long value) { asm ("st.global.wt.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.wt.v2.u8 [%0], {%1,%2};"  :: __LDG_PTR (ptr),  "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wt.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(ushort2 *ptr, ushort2 value) { asm ("st.global.wt.v2.u16 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(ushort4 *ptr, ushort4 value) { asm ("st.global.wt.v4.u16 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(uint2 *ptr, uint2 value) { asm ("st.global.wt.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(uint4 *ptr, uint4 value) { asm ("st.global.wt.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.wt.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+// Floating-point __stwt overloads: float lanes bind with "f" (.f32), double lanes with "d" (.f64).
+__SM_32_INTRINSICS_DECL__ void __stwt(float *ptr, float value) { asm ("st.global.wt.f32 [%0], %1;"  :: __LDG_PTR (ptr), "f"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(double *ptr, double value) { asm ("st.global.wt.f64 [%0], %1;"  :: __LDG_PTR (ptr), "d"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(float2 *ptr, float2 value) { asm ("st.global.wt.v2.f32 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(float4 *ptr, float4 value) { asm ("st.global.wt.v4.f32 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(double2 *ptr, double2 value) { asm ("st.global.wt.v2.f64 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
+
+#undef __LDG_PTR
+
+
+// SHF is the "funnel shift" operation - an accelerated left/right shift with carry
+// operating on 64-bit quantities, which are concatenations of two 32-bit registers.
+
+// Shifts the 64-bit value [hi:lo] left by "shift" bits and returns the most significant 32 bits of the result; shf.l.wrap takes the shift amount modulo 32.
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
+{
+    unsigned int ret;
+    asm volatile ("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift));
+    return ret;
+}
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
+{
+    unsigned int ret;
+    asm volatile ("shf.l.clamp.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); // left funnel shift of [hi:lo]; .clamp saturates the shift amount at 32 instead of wrapping
+    return ret;
+}
+
+// Shifts the 64-bit value [hi:lo] right by "shift" bits and returns the least significant 32 bits of the result; shf.r.wrap takes the shift amount modulo 32.
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
+{
+    unsigned int ret;
+    asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift));
+    return ret;
+}
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
+{
+    unsigned int ret;
+    asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); // right funnel shift of [hi:lo]; .clamp saturates the shift amount at 32 instead of wrapping
+    return ret;
+}
+
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_32_INTRINSICS_DECL__
+
+#endif /* !__SM_32_INTRINSICS_HPP__ */
+
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_35_atomic_functions.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_35_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8961079aeac4c9e73a7c2825cf9ea10b171af09
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_35_atomic_functions.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_35_ATOMIC_FUNCTIONS_H__)
+#define __SM_35_ATOMIC_FUNCTIONS_H__
+
+/*******************************************************************************
+* All sm_35 atomics are supported by sm_32 so simply include its header file   *
+*******************************************************************************/
+#include "sm_32_atomic_functions.h"
+
+#endif /* !__SM_35_ATOMIC_FUNCTIONS_H__ */
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..53d607cae6467244b2f99bd891632b8679828b54
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.h
@@ -0,0 +1,330 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if !defined(__SM_60_ATOMIC_FUNCTIONS_H__)
+#define __SM_60_ATOMIC_FUNCTIONS_H__
+
+/* Declaration specifiers for this header: bare __device__ under NVRTC, extern __device__ __cudart_builtin__ under NVHPC, static __inline__ __device__ otherwise. */
+#if defined(__CUDACC_RTC__)
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ __device__
+#elif defined(_NVHPC_CUDA)
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ extern __device__ __cudart_builtin__
+#else /* !__CUDACC_RTC__ && !_NVHPC_CUDA */
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ / _NVHPC_CUDA */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/* When __CUDA_ARCH__ is undefined and the compiler is not NVHPC, each declaration
+ * below gets an empty body "{ }"; otherwise it stays a prototype ending in ';'. */
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* __CUDA_ARCH__ || _NVHPC_CUDA */
+#define __DEF_IF_HOST ;
+#endif /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+
+
+
+/*******************************************************************************
+*                                                                              *
+* atomicAdd(double) and the _block / _system variants of the atomic functions  *
+*                                                                              *
+*******************************************************************************/
+/* atomicAdd: the double overload, then _block/_system variants for int, unsigned int, unsigned long long, float and double (per the CUDA C++ Programming Guide the suffix selects thread-block vs. system-wide atomicity scope). */
+__SM_60_ATOMIC_FUNCTIONS_DECL__ double atomicAdd(double *address, double val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAdd_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAdd_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAdd_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAdd_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAdd_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAdd_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicAdd_block(float *address, float val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicAdd_system(float *address, float val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+double atomicAdd_block(double *address, double val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+double atomicAdd_system(double *address, double val) __DEF_IF_HOST
+/* atomicSub: _block and _system variants for int and unsigned int. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicSub_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicSub_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicSub_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicSub_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+/* atomicExch: _block and _system variants for int, unsigned int, unsigned long long and float. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicExch_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicExch_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicExch_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicExch_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicExch_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicExch_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicExch_block(float *address, float val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicExch_system(float *address, float val) __DEF_IF_HOST
+/* atomicMin: _block and _system variants for int, long long, unsigned int and unsigned long long. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMin_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMin_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMin_block(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMin_system(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMin_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMin_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMin_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMin_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+/* atomicMax: _block and _system variants for int, long long, unsigned int and unsigned long long. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMax_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMax_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMax_block(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMax_system(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMax_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMax_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMax_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMax_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+/* atomicInc / atomicDec: _block and _system variants, declared for unsigned int only. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicInc_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicInc_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicDec_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicDec_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+/* atomicCAS (address, compare, val): _block and _system variants for int, unsigned int and unsigned long long int. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicCAS_block(int *address, int compare, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicCAS_system(int *address, int compare, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicCAS_block(unsigned int *address, unsigned int compare,
+                             unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicCAS_system(unsigned int *address, unsigned int compare,
+                              unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long int atomicCAS_block(unsigned long long int *address,
+                                       unsigned long long int compare,
+                                       unsigned long long int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long int atomicCAS_system(unsigned long long int *address,
+                                        unsigned long long int compare,
+                                        unsigned long long int val) __DEF_IF_HOST
+/* atomicAnd: _block and _system variants for int, long long, unsigned int and unsigned long long. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAnd_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAnd_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicAnd_block(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicAnd_system(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAnd_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAnd_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAnd_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAnd_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+/* atomicOr: _block and _system variants for int, long long, unsigned int and unsigned long long. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicOr_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicOr_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicOr_block(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicOr_system(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicOr_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicOr_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicOr_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicOr_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+/* atomicXor: _block and _system variants for int, long long, unsigned int and unsigned long long. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicXor_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicXor_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicXor_block(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicXor_system(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicXor_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicXor_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicXor_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicXor_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_60_ATOMIC_FUNCTIONS_DECL__
+#undef __DEF_IF_HOST
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_60_atomic_functions.hpp"
+#endif /* !__CUDACC_RTC__  && defined(__CUDA_ARCH__)  */
+
+#endif /* !__SM_60_ATOMIC_FUNCTIONS_H__ */
+
+#undef EXCLUDE_FROM_RTC
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.hpp b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1d6ac004cd92d3af9281143123289bc2353dd494
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.hpp
@@ -0,0 +1,742 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_60_ATOMIC_FUNCTIONS_HPP__)
+#define __SM_60_ATOMIC_FUNCTIONS_HPP__
+/* Specifier macro for this file: bare __device__ under NVRTC, static __inline__ __device__ otherwise. */
+#if defined(__CUDACC_RTC__)
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+extern "C"
+{
+extern __device__ __device_builtin__ double __dAtomicAdd(double *address, double val); /* declaration only (extern, __device_builtin__); no body in this file's visible part */
+/* __{i,u,ull,f,d}AtomicAdd_{block,system}: extern builtin declarations for int, unsigned int, unsigned long long, float and double. */
+extern __device__ __device_builtin__
+int __iAtomicAdd_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicAdd_system(int *address, int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicAdd_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicAdd_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicAdd_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicAdd_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+float __fAtomicAdd_block(float *address, float val);
+
+extern __device__ __device_builtin__
+float __fAtomicAdd_system(float *address, float val);
+
+extern __device__ __device_builtin__
+double __dAtomicAdd_block(double *address, double val);
+
+extern __device__ __device_builtin__
+double __dAtomicAdd_system(double *address, double val);
+/* __{i,u,ull,f}AtomicExch_{block,system}: extern builtin declarations for int, unsigned int, unsigned long long and float. */
+extern __device__ __device_builtin__
+int __iAtomicExch_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicExch_system(int *address, int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicExch_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicExch_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicExch_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicExch_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+float __fAtomicExch_block(float *address, float val);
+
+extern __device__ __device_builtin__
+float __fAtomicExch_system(float *address, float val);
+/* __{i,ill,u,ull}AtomicMin_{block,system}: extern builtin declarations for int, long long, unsigned int and unsigned long long. */
+extern __device__ __device_builtin__
+int __iAtomicMin_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicMin_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __illAtomicMin_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __illAtomicMin_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicMin_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicMin_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicMin_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicMin_system(unsigned long long *address, unsigned long long val);
+/* __{i,ill,u,ull}AtomicMax_{block,system}: extern builtin declarations for int, long long, unsigned int and unsigned long long. */
+extern __device__ __device_builtin__
+int __iAtomicMax_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicMax_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __illAtomicMax_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __illAtomicMax_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicMax_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicMax_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicMax_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicMax_system(unsigned long long *address, unsigned long long val);
+/* __uAtomicInc_{block,system} / __uAtomicDec_{block,system}: extern builtin declarations, unsigned int only. */
+extern __device__ __device_builtin__
+unsigned int __uAtomicInc_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicInc_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicDec_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicDec_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+int __iAtomicCAS_block(int *address, int compare, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicCAS_system(int *address, int compare, int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicCAS_block(unsigned int *address, unsigned int compare,
+                                unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicCAS_system(unsigned int *address, unsigned int compare,
+                                 unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicCAS_block(unsigned long long int *address,
+                                        unsigned long long int compare,
+                                        unsigned long long int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicCAS_system(unsigned long long int *address,
+                                         unsigned long long int compare,
+                                         unsigned long long int val);
+
+extern __device__ __device_builtin__
+int __iAtomicAnd_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicAnd_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __llAtomicAnd_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __llAtomicAnd_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicAnd_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicAnd_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicAnd_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicAnd_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+int __iAtomicOr_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicOr_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __llAtomicOr_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __llAtomicOr_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicOr_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicOr_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicOr_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicOr_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+int __iAtomicXor_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicXor_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __llAtomicXor_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __llAtomicXor_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicXor_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicXor_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicXor_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicXor_system(unsigned long long *address, unsigned long long val);
+}
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ double atomicAdd(double *address, double val) /* double overload: forwards to __dAtomicAdd and returns its result. */
+{
+  return __dAtomicAdd(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: forwards to __iAtomicAdd_block and returns its result. */
+int atomicAdd_block(int *address, int val)
+{
+  return __iAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: forwards to __iAtomicAdd_system and returns its result. */
+int atomicAdd_system(int *address, int val)
+{
+  return __iAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: forwards to __uAtomicAdd_block and returns its result. */
+unsigned int atomicAdd_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: forwards to __uAtomicAdd_system and returns its result. */
+unsigned int atomicAdd_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned long long overload: forwards to __ullAtomicAdd_block and returns its result. */
+unsigned long long atomicAdd_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned long long overload: forwards to __ullAtomicAdd_system and returns its result. */
+unsigned long long atomicAdd_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* float overload: forwards to __fAtomicAdd_block and returns its result. */
+float atomicAdd_block(float *address, float val)
+{
+  return __fAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* float overload: forwards to __fAtomicAdd_system and returns its result. */
+float atomicAdd_system(float *address, float val)
+{
+  return __fAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* double overload: forwards to __dAtomicAdd_block and returns its result. */
+double atomicAdd_block(double *address, double val)
+{
+  return __dAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* double overload: forwards to __dAtomicAdd_system and returns its result. */
+double atomicAdd_system(double *address, double val)
+{
+  return __dAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: subtracts by atomically adding the negation of val via __iAtomicAdd_block; returns that call's result. */
+int atomicSub_block(int *address, int val)
+{
+  return __iAtomicAdd_block(address, 0u - (unsigned int)val); /* negate in unsigned arithmetic (wraps modulo 2^N): same bits as -val, but no signed overflow when val == INT_MIN */
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: subtracts by atomically adding the negation of val via __iAtomicAdd_system; returns that call's result. */
+int atomicSub_system(int *address, int val)
+{
+  return __iAtomicAdd_system(address, 0u - (unsigned int)val); /* negate in unsigned arithmetic (wraps modulo 2^N): same bits as -val, but no signed overflow when val == INT_MIN */
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: subtracts by atomically adding the negation of val via __uAtomicAdd_block; returns that call's result. */
+unsigned int atomicSub_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_block(address, 0u - val); /* unsigned negation wraps modulo 2^N; no round-trip through int, so no signed overflow for val == 0x80000000 */
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: subtracts by atomically adding the negation of val via __uAtomicAdd_system; returns that call's result. */
+unsigned int atomicSub_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_system(address, 0u - val); /* unsigned negation wraps modulo 2^N; no round-trip through int, so no signed overflow for val == 0x80000000 */
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: forwards to __iAtomicExch_block and returns its result. */
+int atomicExch_block(int *address, int val)
+{
+  return __iAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: forwards to __iAtomicExch_system and returns its result. */
+int atomicExch_system(int *address, int val)
+{
+  return __iAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: forwards to __uAtomicExch_block and returns its result. */
+unsigned int atomicExch_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: forwards to __uAtomicExch_system and returns its result. */
+unsigned int atomicExch_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned long long overload: forwards to __ullAtomicExch_block and returns its result. */
+unsigned long long atomicExch_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned long long overload: forwards to __ullAtomicExch_system and returns its result. */
+unsigned long long atomicExch_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* float overload: forwards to __fAtomicExch_block and returns its result. */
+float atomicExch_block(float *address, float val)
+{
+  return __fAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* float overload: forwards to __fAtomicExch_system and returns its result. */
+float atomicExch_system(float *address, float val)
+{
+  return __fAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: forwards to the __iAtomicMin_block device builtin and returns its result. */
+int atomicMin_block(int *address, int val)
+{
+  return __iAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: forwards to the __iAtomicMin_system device builtin and returns its result. */
+int atomicMin_system(int *address, int val)
+{
+  return __iAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* long long overload: forwards to the __illAtomicMin_block device builtin and returns its result. */
+long long atomicMin_block(long long *address, long long val)
+{
+  return __illAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* long long overload: forwards to the __illAtomicMin_system device builtin and returns its result. */
+long long atomicMin_system(long long *address, long long val)
+{
+  return __illAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: forwards to the __uAtomicMin_block device builtin and returns its result. */
+unsigned int atomicMin_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: forwards to the __uAtomicMin_system device builtin and returns its result. */
+unsigned int atomicMin_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned long long overload: forwards to the __ullAtomicMin_block device builtin and returns its result. */
+unsigned long long atomicMin_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned long long overload: forwards to the __ullAtomicMin_system device builtin and returns its result. */
+unsigned long long atomicMin_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: forwards to the __iAtomicMax_block device builtin and returns its result. */
+int atomicMax_block(int *address, int val)
+{
+  return __iAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: forwards to the __iAtomicMax_system device builtin and returns its result. */
+int atomicMax_system(int *address, int val)
+{
+  return __iAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* long long overload: forwards to the __illAtomicMax_block device builtin and returns its result. */
+long long atomicMax_block(long long *address, long long val)
+{
+  return __illAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* long long overload: forwards to the __illAtomicMax_system device builtin and returns its result. */
+long long atomicMax_system(long long *address, long long val)
+{
+  return __illAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: forwards to the __uAtomicMax_block device builtin and returns its result. */
+unsigned int atomicMax_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: forwards to the __uAtomicMax_system device builtin and returns its result. */
+unsigned int atomicMax_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned long long overload: forwards to the __ullAtomicMax_block device builtin and returns its result. */
+unsigned long long atomicMax_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned long long overload: forwards to the __ullAtomicMax_system device builtin and returns its result. */
+unsigned long long atomicMax_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* Forwards to the __uAtomicInc_block device builtin and returns its result. */
+unsigned int atomicInc_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicInc_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* Forwards to the __uAtomicInc_system device builtin and returns its result. */
+unsigned int atomicInc_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicInc_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* Forwards to the __uAtomicDec_block device builtin and returns its result. */
+unsigned int atomicDec_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicDec_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* Forwards to the __uAtomicDec_system device builtin and returns its result. */
+unsigned int atomicDec_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicDec_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: forwards (address, compare, val) to the __iAtomicCAS_block device builtin and returns its result. */
+int atomicCAS_block(int *address, int compare, int val)
+{
+  return __iAtomicCAS_block(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: forwards (address, compare, val) to the __iAtomicCAS_system device builtin and returns its result. */
+int atomicCAS_system(int *address, int compare, int val)
+{
+  return __iAtomicCAS_system(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: forwards (address, compare, val) to the __uAtomicCAS_block device builtin and returns its result. */
+unsigned int atomicCAS_block(unsigned int *address, unsigned int compare,
+                             unsigned int val)
+{
+  return __uAtomicCAS_block(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: forwards (address, compare, val) to the __uAtomicCAS_system device builtin and returns its result. */
+unsigned int atomicCAS_system(unsigned int *address, unsigned int compare,
+                              unsigned int val)
+{
+  return __uAtomicCAS_system(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned long long overload: forwards (address, compare, val) to the __ullAtomicCAS_block device builtin and returns its result. */
+unsigned long long int atomicCAS_block(unsigned long long int *address,
+                                       unsigned long long int compare,
+                                       unsigned long long int val)
+{
+  return __ullAtomicCAS_block(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned long long overload: forwards (address, compare, val) to the __ullAtomicCAS_system device builtin and returns its result. */
+unsigned long long int atomicCAS_system(unsigned long long int *address,
+                                        unsigned long long int compare,
+                                        unsigned long long int val)
+{
+  return __ullAtomicCAS_system(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: forwards to the __iAtomicAnd_block device builtin and returns its result. */
+int atomicAnd_block(int *address, int val)
+{
+  return __iAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: forwards to the __iAtomicAnd_system device builtin and returns its result. */
+int atomicAnd_system(int *address, int val)
+{
+  return __iAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* long long overload: forwards to the __llAtomicAnd_block device builtin and returns its result. */
+long long atomicAnd_block(long long *address, long long val)
+{
+  return __llAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* long long overload: forwards to the __llAtomicAnd_system device builtin and returns its result. */
+long long atomicAnd_system(long long *address, long long val)
+{
+  return __llAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: forwards to the __uAtomicAnd_block device builtin and returns its result. */
+unsigned int atomicAnd_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: forwards to the __uAtomicAnd_system device builtin and returns its result. */
+unsigned int atomicAnd_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned long long overload: forwards to the __ullAtomicAnd_block device builtin and returns its result. */
+unsigned long long atomicAnd_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned long long overload: forwards to the __ullAtomicAnd_system device builtin and returns its result. */
+unsigned long long atomicAnd_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: forwards to the __iAtomicOr_block device builtin and returns its result. */
+int atomicOr_block(int *address, int val)
+{
+  return __iAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: forwards to the __iAtomicOr_system device builtin and returns its result. */
+int atomicOr_system(int *address, int val)
+{
+  return __iAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* long long overload: forwards to the __llAtomicOr_block device builtin and returns its result. */
+long long atomicOr_block(long long *address, long long val)
+{
+  return __llAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* long long overload: forwards to the __llAtomicOr_system device builtin and returns its result. */
+long long atomicOr_system(long long *address, long long val)
+{
+  return __llAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: forwards to the __uAtomicOr_block device builtin and returns its result. */
+unsigned int atomicOr_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: forwards to the __uAtomicOr_system device builtin and returns its result. */
+unsigned int atomicOr_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned long long overload: forwards to the __ullAtomicOr_block device builtin and returns its result. */
+unsigned long long atomicOr_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned long long overload: forwards to the __ullAtomicOr_system device builtin and returns its result. */
+unsigned long long atomicOr_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: forwards to the __iAtomicXor_block device builtin and returns its result. */
+int atomicXor_block(int *address, int val)
+{
+  return __iAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* int overload: forwards to the __iAtomicXor_system device builtin and returns its result. */
+int atomicXor_system(int *address, int val)
+{
+  return __iAtomicXor_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* long long overload: forwards to the __llAtomicXor_block device builtin and returns its result. */
+long long atomicXor_block(long long *address, long long val)
+{
+  return __llAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* long long overload: forwards to the __llAtomicXor_system device builtin and returns its result. */
+long long atomicXor_system(long long *address, long long val)
+{
+  return __llAtomicXor_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: forwards to the __uAtomicXor_block device builtin and returns its result. */
+unsigned int atomicXor_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned int overload: forwards to the __uAtomicXor_system device builtin and returns its result. */
+unsigned int atomicXor_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicXor_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned long long overload: forwards to the __ullAtomicXor_block device builtin and returns its result. */
+unsigned long long atomicXor_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ /* unsigned long long overload: forwards to the __ullAtomicXor_system device builtin and returns its result. */
+unsigned long long atomicXor_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicXor_system(address, val);
+}
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_60_ATOMIC_FUNCTIONS_DECL__
+
+#endif /* !__SM_60_ATOMIC_FUNCTIONS_HPP__ */
+
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_61_intrinsics.hpp b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_61_intrinsics.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5a561384b08a65445eed86bfc96a0694e5b9190c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/sm_61_intrinsics.hpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright 2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_61_INTRINSICS_HPP__)
+#define __SM_61_INTRINSICS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_61_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 610
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-6.1 intrinsics which are included as        *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+// __dp4a overloads (PTX dp4a instruction)
+__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, int srcB, int c) { // issues PTX dp4a.s32.s32 on (srcA, srcB, c) and returns the register it writes
+    int result;
+    asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(result) : "r"(srcA), "r"(srcB), "r"(c));
+    return result;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(unsigned int srcA, unsigned int srcB, unsigned int c) { // issues PTX dp4a.u32.u32 on (srcA, srcB, c) and returns the register it writes
+    unsigned int result;
+    asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(srcA), "r"(srcB), "r"(c));
+    return result;
+}
+
+__SM_61_INTRINSICS_DECL__ int __dp4a(char4 srcA, char4 srcB, int c) { // char4 operands are reinterpreted as 32-bit ints and fed to PTX dp4a.s32.s32
+    int result;
+    asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(result) : "r"(*reinterpret_cast<int *>(&srcA)), "r"(*reinterpret_cast<int *>(&srcB)), "r"(c));
+    return result;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(uchar4 srcA, uchar4 srcB, unsigned int c) { // uchar4 operands are reinterpreted as 32-bit unsigned ints and fed to PTX dp4a.u32.u32
+    unsigned int result;
+    asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(*reinterpret_cast<unsigned int *>(&srcA)), "r"(*reinterpret_cast<unsigned int *>(&srcB)), "r"(c));
+    return result;
+}
+
+// __dp2a_lo overloads (PTX dp2a.lo instruction)
+__SM_61_INTRINSICS_DECL__ int __dp2a_lo(int srcA, int srcB, int c) { // issues PTX dp2a.lo.s32.s32 on (srcA, srcB, c) and returns the register it writes
+    int result;
+    asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(result) : "r"(srcA), "r"(srcB), "r"(c));
+    return result;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(unsigned int srcA, unsigned int srcB, unsigned int c) { // issues PTX dp2a.lo.u32.u32 on (srcA, srcB, c) and returns the register it writes
+    unsigned int result;
+    asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(srcA), "r"(srcB), "r"(c));
+    return result;
+}
+
+__SM_61_INTRINSICS_DECL__ int __dp2a_lo(short2 srcA, char4 srcB, int c) { // short2/char4 operands are reinterpreted as 32-bit ints and fed to PTX dp2a.lo.s32.s32
+    int result;
+    asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(result) : "r"(*reinterpret_cast<int *>(&srcA)), "r"(*reinterpret_cast<int *>(&srcB)), "r"(c));
+    return result;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(ushort2 srcA, uchar4 srcB, unsigned int c) { // ushort2/uchar4 operands are reinterpreted as 32-bit unsigned ints and fed to PTX dp2a.lo.u32.u32
+    unsigned int result;
+    asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(*reinterpret_cast<unsigned int *>(&srcA)), "r"(*reinterpret_cast<unsigned int *>(&srcB)), "r"(c));
+    return result;
+}
+
+// __dp2a_hi overloads (PTX dp2a.hi instruction)
+__SM_61_INTRINSICS_DECL__ int __dp2a_hi(int srcA, int srcB, int c) { // issues PTX dp2a.hi.s32.s32 on (srcA, srcB, c) and returns the register it writes
+    int result;
+    asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(result) : "r"(srcA), "r"(srcB), "r"(c));
+    return result;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(unsigned int srcA, unsigned int srcB, unsigned int c) { // issues PTX dp2a.hi.u32.u32 on (srcA, srcB, c) and returns the register it writes
+    unsigned int result;
+    asm volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(srcA), "r"(srcB), "r"(c));
+    return result;
+}
+
+__SM_61_INTRINSICS_DECL__ int __dp2a_hi(short2 srcA, char4 srcB, int c) { // short2/char4 operands are reinterpreted as 32-bit ints and fed to PTX dp2a.hi.s32.s32
+    int result;
+    asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(result) : "r"(*reinterpret_cast<int *>(&srcA)), "r"(*reinterpret_cast<int *>(&srcB)), "r"(c));
+    return result;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(ushort2 srcA, uchar4 srcB, unsigned int c) { // ushort2/uchar4 operands are reinterpreted as 32-bit unsigned ints and fed to PTX dp2a.hi.u32.u32
+    unsigned int result;
+    asm volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(*reinterpret_cast<unsigned int *>(&srcA)), "r"(*reinterpret_cast<unsigned int *>(&srcB)), "r"(c));
+    return result;
+}
+
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 610 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_61_INTRINSICS_DECL__
+
+#endif /* !__SM_61_INTRINSICS_HPP__ */
+
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/surface_functions.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/surface_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..2fb940c1d2bd5ee7b4a5020e12297bc2927e0386
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/surface_functions.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SURFACE_FUNCTIONS_H__)
+#define __SURFACE_FUNCTIONS_H__
+
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+#include "cuda_surface_types.h"
+
+#if defined(_WIN32)
+# define __DEPRECATED__ __declspec(deprecated)
+#else
+# define __DEPRECATED__  __attribute__((deprecated))
+#endif
+
+template <typename T> struct __nv_surf_trait {  typedef void * cast_type; }; // primary template: cast_type is void * for any T that has no explicit specialization
+
+template<> struct __nv_surf_trait<char> {  typedef char * cast_type; }; // char-family specializations (char .. uchar4): cast_type is a pointer to the type itself
+template<> struct __nv_surf_trait<signed char> {  typedef signed char * cast_type; };
+template<> struct __nv_surf_trait<unsigned char> {  typedef unsigned char * cast_type; };
+template<> struct __nv_surf_trait<char1> {  typedef char1 * cast_type; };
+template<> struct __nv_surf_trait<uchar1> {  typedef uchar1 * cast_type; };
+template<> struct __nv_surf_trait<char2> {  typedef char2 * cast_type; };
+template<> struct __nv_surf_trait<uchar2> {  typedef uchar2 * cast_type; };
+template<> struct __nv_surf_trait<char4> {  typedef char4 * cast_type; };
+template<> struct __nv_surf_trait<uchar4> {  typedef uchar4 * cast_type; };
+template<> struct __nv_surf_trait<short> {  typedef short * cast_type; }; // short-family specializations (short .. ushort4): cast_type is a pointer to the type itself
+template<> struct __nv_surf_trait<unsigned short> {  typedef unsigned short * cast_type; };
+template<> struct __nv_surf_trait<short1> {  typedef short1 * cast_type; };
+template<> struct __nv_surf_trait<ushort1> {  typedef ushort1 * cast_type; };
+template<> struct __nv_surf_trait<short2> {  typedef short2 * cast_type; };
+template<> struct __nv_surf_trait<ushort2> {  typedef ushort2 * cast_type; };
+template<> struct __nv_surf_trait<short4> {  typedef short4 * cast_type; };
+template<> struct __nv_surf_trait<ushort4> {  typedef ushort4 * cast_type; };
+template<> struct __nv_surf_trait<int> {  typedef int * cast_type; }; // int-family specializations (int .. uint4): cast_type is a pointer to the type itself
+template<> struct __nv_surf_trait<unsigned int> {  typedef unsigned int * cast_type; };
+template<> struct __nv_surf_trait<int1> {  typedef int1 * cast_type; };
+template<> struct __nv_surf_trait<uint1> {  typedef uint1 * cast_type; };
+template<> struct __nv_surf_trait<int2> {  typedef int2 * cast_type; };
+template<> struct __nv_surf_trait<uint2> {  typedef uint2 * cast_type; };
+template<> struct __nv_surf_trait<int4> {  typedef int4 * cast_type; };
+template<> struct __nv_surf_trait<uint4> {  typedef uint4 * cast_type; };
+template<> struct __nv_surf_trait<long long> {  typedef long long * cast_type; }; // long long-family specializations (long long .. ulonglong2): cast_type is a pointer to the type itself
+template<> struct __nv_surf_trait<unsigned long long> {  typedef unsigned long long * cast_type; };
+template<> struct __nv_surf_trait<longlong1> {  typedef longlong1 * cast_type; };
+template<> struct __nv_surf_trait<ulonglong1> {  typedef ulonglong1 * cast_type; };
+template<> struct __nv_surf_trait<longlong2> {  typedef longlong2 * cast_type; };
+template<> struct __nv_surf_trait<ulonglong2> {  typedef ulonglong2 * cast_type; };
+#if !defined(__LP64__)
+template<> struct __nv_surf_trait<long> {  typedef int * cast_type; };
+template<> struct __nv_surf_trait<unsigned long> {  typedef unsigned int * cast_type; };
+template<> struct __nv_surf_trait<long1> {  typedef int1 * cast_type; };
+template<> struct __nv_surf_trait<ulong1> {  typedef uint1 * cast_type; };
+template<> struct __nv_surf_trait<long2> {  typedef int2 * cast_type; };
+template<> struct __nv_surf_trait<ulong2> {  typedef uint2 * cast_type; };
+template<> struct __nv_surf_trait<long4> {  typedef uint4 * cast_type; };
+template<> struct __nv_surf_trait<ulong4> {  typedef int4 * cast_type; };
+#endif
+template<> struct __nv_surf_trait<float> {  typedef float * cast_type; }; // float-family specializations (float .. float4): cast_type is a pointer to the type itself
+template<> struct __nv_surf_trait<float1> {  typedef float1 * cast_type; };
+template<> struct __nv_surf_trait<float2> {  typedef float2 * cast_type; };
+template<> struct __nv_surf_trait<float4> {  typedef float4 * cast_type; };
+
+
+#undef __DEPRECATED__
+
+
+#endif /* __cplusplus && __CUDACC__ */
+#endif /* !__SURFACE_FUNCTIONS_H__ */
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/surface_types.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/surface_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1d0eccdee2a80132c5dadfcd80643c1b41eb8ec
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/surface_types.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SURFACE_TYPES_H__)
+#define __SURFACE_TYPES_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "driver_types.h"
+
+#ifndef __CUDACC_RTC_MINIMAL__
+
+/**
+ * \addtogroup CUDART_TYPES
+ *
+ * @{
+ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+/* Surface type codes. Each *Layered code equals its non-layered counterpart with 0xF0 set (0x01 -> 0xF1, 0x02 -> 0xF2, 0x0C -> 0xFC). */
+#define cudaSurfaceType1D              0x01
+#define cudaSurfaceType2D              0x02
+#define cudaSurfaceType3D              0x03
+#define cudaSurfaceTypeCubemap         0x0C
+#define cudaSurfaceType1DLayered       0xF1
+#define cudaSurfaceType2DLayered       0xF2
+#define cudaSurfaceTypeCubemapLayered  0xFC
+
+/**
+ * CUDA Surface boundary modes (the behaviour notes below come from the CUDA C++ Programming Guide, not from this header)
+ */
+enum __device_builtin__ cudaSurfaceBoundaryMode
+{
+    cudaBoundaryModeZero  = 0,    /**< Zero boundary mode: out-of-range reads return zero, out-of-range writes are ignored */
+    cudaBoundaryModeClamp = 1,    /**< Clamp boundary mode: out-of-range coordinates are clamped to the valid range */
+    cudaBoundaryModeTrap  = 2     /**< Trap boundary mode: out-of-range accesses cause kernel execution to fail */
+};
+
+/**
+ * CUDA Surface format modes (this header only declares the enum; nothing else in it refers to these values)
+ */
+enum __device_builtin__  cudaSurfaceFormatMode
+{
+    cudaFormatModeForced = 0,     /**< Forced format mode */
+    cudaFormatModeAuto = 1        /**< Auto format mode */
+};
+
+/**
+ * An opaque value that represents a CUDA Surface object (carried as a plain unsigned long long, so it is passed by value)
+ */
+typedef __device_builtin__ unsigned long long cudaSurfaceObject_t;
+
+
+/** @} */ /* END CUDART_TYPES */
+
+#endif  /* !__CUDACC_RTC_MINIMAL__ */
+#endif /* !__SURFACE_TYPES_H__ */
diff --git a/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/texture_indirect_functions.h b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/texture_indirect_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e5537d87294ee78ecec567893a6aaec333db317
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/texture_indirect_functions.h
@@ -0,0 +1,638 @@
+/*
+ * Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+
+#ifndef __TEXTURE_INDIRECT_FUNCTIONS_H__
+#define __TEXTURE_INDIRECT_FUNCTIONS_H__
+
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+
+#include "cuda_runtime_api.h"
+
+/* __NV_TEX_SPARSE gates the overloads in this header that take `bool* isResident`; it is tested with plain `#if` and #undef'd at the end of the header. */
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
+#define __NV_TEX_SPARSE 1
+#endif  /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */
+// __nv_itex_trait<T>::type is void for every element type listed here; the pointer-form texture functions in this header use it as their return type.
+template <typename T> struct __nv_itex_trait {   };  // primary template: no 'type' member
+template<> struct __nv_itex_trait<char> { typedef void type; };
+template<> struct __nv_itex_trait<signed char> { typedef void type; };
+template<> struct __nv_itex_trait<char1> { typedef void type; };
+template<> struct __nv_itex_trait<char2> { typedef void type; };
+template<> struct __nv_itex_trait<char4> { typedef void type; };
+template<> struct __nv_itex_trait<unsigned char> { typedef void type; };
+template<> struct __nv_itex_trait<uchar1> { typedef void type; };
+template<> struct __nv_itex_trait<uchar2> { typedef void type; };
+template<> struct __nv_itex_trait<uchar4> { typedef void type; };
+template<> struct __nv_itex_trait<short> { typedef void type; };
+template<> struct __nv_itex_trait<short1> { typedef void type; };
+template<> struct __nv_itex_trait<short2> { typedef void type; };
+template<> struct __nv_itex_trait<short4> { typedef void type; };
+template<> struct __nv_itex_trait<unsigned short> { typedef void type; };
+template<> struct __nv_itex_trait<ushort1> { typedef void type; };
+template<> struct __nv_itex_trait<ushort2> { typedef void type; };
+template<> struct __nv_itex_trait<ushort4> { typedef void type; };
+template<> struct __nv_itex_trait<int> { typedef void type; };
+template<> struct __nv_itex_trait<int1> { typedef void type; };
+template<> struct __nv_itex_trait<int2> { typedef void type; };
+template<> struct __nv_itex_trait<int4> { typedef void type; };
+template<> struct __nv_itex_trait<unsigned int> { typedef void type; };
+template<> struct __nv_itex_trait<uint1> { typedef void type; };
+template<> struct __nv_itex_trait<uint2> { typedef void type; };
+template<> struct __nv_itex_trait<uint4> { typedef void type; };
+#if !defined(__LP64__)
+template<> struct __nv_itex_trait<long> { typedef void type; };
+template<> struct __nv_itex_trait<long1> { typedef void type; };
+template<> struct __nv_itex_trait<long2> { typedef void type; };
+template<> struct __nv_itex_trait<long4> { typedef void type; };
+template<> struct __nv_itex_trait<unsigned long> { typedef void type; };
+template<> struct __nv_itex_trait<ulong1> { typedef void type; };
+template<> struct __nv_itex_trait<ulong2> { typedef void type; };
+template<> struct __nv_itex_trait<ulong4> { typedef void type; };
+#endif /* !__LP64__ */
+template<> struct __nv_itex_trait<float> { typedef void type; };
+template<> struct __nv_itex_trait<float1> { typedef void type; };
+template<> struct __nv_itex_trait<float2> { typedef void type; };
+template<> struct __nv_itex_trait<float4> { typedef void type; };
+
+
+// tex1Dfetch, pointer form: forwards (ptr, obj, x) to __nv_tex_surf_handler as "__itex1Dfetch"; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex1Dfetch(T *ptr, cudaTextureObject_t obj, int x)
+{
+  __nv_tex_surf_handler("__itex1Dfetch", ptr, obj, x);
+}
+// tex1Dfetch, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T tex1Dfetch(cudaTextureObject_t texObject, int x)
+{
+  T result;
+  tex1Dfetch(&result, texObject, x);
+  return result;
+}
+// tex1D, pointer form: forwards (ptr, obj, x) to __nv_tex_surf_handler as "__itex1D"; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex1D(T *ptr, cudaTextureObject_t obj, float x)
+{
+  __nv_tex_surf_handler("__itex1D", ptr, obj, x);
+}
+
+// tex1D, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T tex1D(cudaTextureObject_t texObject, float x)
+{
+  T result;
+  tex1D(&result, texObject, x);
+  return result;
+}
+
+// tex2D, pointer form: forwards (ptr, obj, x, y) to __nv_tex_surf_handler as "__itex2D"; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2D(T *ptr, cudaTextureObject_t obj, float x, float y)
+{
+  __nv_tex_surf_handler("__itex2D", ptr, obj, x, y);
+}
+// tex2D, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T tex2D(cudaTextureObject_t texObject, float x, float y)
+{
+  T result;
+  tex2D(&result, texObject, x, y);
+  return result;
+}
+// Sparse forms, compiled only when __NV_TEX_SPARSE is non-zero: "__itex2D_sparse" also receives the address of a status byte; *isResident = (status != 0).
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2D(T *ptr, cudaTextureObject_t obj, float x, float y,
+                                                          bool *isResident)
+{
+  unsigned char status;
+  __nv_tex_surf_handler("__itex2D_sparse", ptr, obj, x, y, &status);
+  *isResident = status != 0;
+}
+// Sparse by-value form: returns the local T filled by the sparse pointer form, which also sets *isResident.
+template <typename T>
+static __device__ T tex2D(cudaTextureObject_t texObject, float x, float y, bool *isResident)
+{
+  T result;
+  tex2D(&result, texObject, x, y, isResident);
+  return result;
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+// tex3D, pointer form: forwards (ptr, obj, x, y, z) to __nv_tex_surf_handler as "__itex3D"; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex3D(T *ptr, cudaTextureObject_t obj, float x, float y, float z)
+{
+  __nv_tex_surf_handler("__itex3D", ptr, obj, x, y, z);
+}
+// tex3D, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T tex3D(cudaTextureObject_t texObject, float x, float y, float z)
+{
+  T result;
+  tex3D(&result, texObject, x, y, z);
+  return result;
+}
+// Sparse forms, compiled only when __NV_TEX_SPARSE is non-zero: "__itex3D_sparse" also receives the address of a status byte; *isResident = (status != 0).
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex3D(T *ptr, cudaTextureObject_t obj, float x, float y, float z,
+                                                          bool *isResident)
+{
+  unsigned char status;
+  __nv_tex_surf_handler("__itex3D_sparse", ptr, obj, x, y, z, &status);
+  *isResident = status != 0;
+}
+// Sparse by-value form: returns the local T filled by the sparse pointer form, which also sets *isResident.
+template <typename T>
+static __device__ T tex3D(cudaTextureObject_t texObject, float x, float y, float z, bool *isResident)
+{
+  T result;
+  tex3D(&result, texObject, x, y, z, isResident);
+  return result;
+}
+#endif  /* __NV_TEX_SPARSE */
+
+// tex1DLayered, pointer form: forwards (ptr, obj, x, layer) to __nv_tex_surf_handler as "__itex1DLayered"; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex1DLayered(T *ptr, cudaTextureObject_t obj, float x, int layer)
+{
+  __nv_tex_surf_handler("__itex1DLayered", ptr, obj, x, layer);
+}
+// tex1DLayered, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T tex1DLayered(cudaTextureObject_t texObject, float x, int layer)
+{
+  T result;
+  tex1DLayered(&result, texObject, x, layer);
+  return result;
+}
+// tex2DLayered, pointer form: forwards (ptr, obj, x, y, layer) to __nv_tex_surf_handler as "__itex2DLayered"; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DLayered(T *ptr, cudaTextureObject_t obj, float x, float y, int layer)
+{
+  __nv_tex_surf_handler("__itex2DLayered", ptr, obj, x, y, layer);
+}
+// tex2DLayered, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T tex2DLayered(cudaTextureObject_t texObject, float x, float y, int layer)
+{
+  T result;
+  tex2DLayered(&result, texObject, x, y, layer);
+  return result;
+}
+// Sparse forms, compiled only when __NV_TEX_SPARSE is non-zero: "__itex2DLayered_sparse" also receives the address of a status byte; *isResident = (status != 0).
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DLayered(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, bool *isResident)
+{
+  unsigned char status;
+  __nv_tex_surf_handler("__itex2DLayered_sparse", ptr, obj, x, y, layer, &status);
+  *isResident = status != 0;
+}
+// Sparse by-value form: returns the local T filled by the sparse pointer form, which also sets *isResident.
+template <typename T>
+static __device__ T tex2DLayered(cudaTextureObject_t texObject, float x, float y, int layer, bool *isResident)
+{
+  T result;
+  tex2DLayered(&result, texObject, x, y, layer, isResident);
+  return result;
+}
+#endif  /* __NV_TEX_SPARSE */
+
+// texCubemap, pointer form: forwards (ptr, obj, x, y, z) to __nv_tex_surf_handler as "__itexCubemap"; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type texCubemap(T *ptr, cudaTextureObject_t obj, float x, float y, float z)
+{
+  __nv_tex_surf_handler("__itexCubemap", ptr, obj, x, y, z);
+}
+
+// texCubemap, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T texCubemap(cudaTextureObject_t texObject, float x, float y, float z)
+{
+  T result;
+  texCubemap(&result, texObject, x, y, z);
+  return result;
+}
+
+// texCubemapLayered, pointer form: forwards (ptr, obj, x, y, z, layer) to __nv_tex_surf_handler as "__itexCubemapLayered"; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type texCubemapLayered(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer)
+{
+  __nv_tex_surf_handler("__itexCubemapLayered", ptr, obj, x, y, z, layer);
+}
+// texCubemapLayered, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T texCubemapLayered(cudaTextureObject_t texObject, float x, float y, float z, int layer)
+{
+  T result;
+  texCubemapLayered(&result, texObject, x, y, z, layer);
+  return result;
+}
+// tex2Dgather, pointer form: forwards (ptr, obj, x, y, comp) to __nv_tex_surf_handler as "__itex2Dgather"; comp defaults to 0; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2Dgather(T *ptr, cudaTextureObject_t obj, float x, float y, int comp = 0)
+{
+  __nv_tex_surf_handler("__itex2Dgather", ptr, obj, x, y, comp);
+}
+// tex2Dgather, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T tex2Dgather(cudaTextureObject_t to, float x, float y, int comp = 0)
+{
+  T result;
+  tex2Dgather(&result, to, x, y, comp);
+  return result;
+}
+// Sparse forms, compiled only when __NV_TEX_SPARSE is non-zero: "__itex2Dgather_sparse" receives comp and then the address of a status byte; *isResident = (status != 0).
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2Dgather(T *ptr, cudaTextureObject_t obj, float x, float y, bool *isResident, int comp = 0)
+{
+  unsigned char status;
+  __nv_tex_surf_handler("__itex2Dgather_sparse", ptr, obj, x, y, comp, &status);
+  *isResident = status != 0;
+}
+// Sparse by-value form: returns the local T filled by the sparse pointer form, which also sets *isResident.
+template <typename T>
+static __device__ T tex2Dgather(cudaTextureObject_t to, float x, float y, bool *isResident, int comp = 0)
+{
+  T result;
+  tex2Dgather(&result, to, x, y, isResident, comp);
+  return result;
+}
+
+#endif  /* __NV_TEX_SPARSE */
+// tex1DLod, pointer form: forwards (ptr, obj, x, level) to __nv_tex_surf_handler as "__itex1DLod"; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex1DLod(T *ptr, cudaTextureObject_t obj, float x, float level)
+{
+  __nv_tex_surf_handler("__itex1DLod", ptr, obj, x, level);
+}
+// tex1DLod, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T tex1DLod(cudaTextureObject_t texObject, float x, float level)
+{
+  T result;
+  tex1DLod(&result, texObject, x, level);
+  return result;
+}
+
+// tex2DLod, pointer form: forwards (ptr, obj, x, y, level) to __nv_tex_surf_handler as "__itex2DLod"; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float level)
+{
+  __nv_tex_surf_handler("__itex2DLod", ptr, obj, x, y, level);
+}
+// tex2DLod, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T tex2DLod(cudaTextureObject_t texObject, float x, float y, float level)
+{
+  T result;
+  tex2DLod(&result, texObject, x, y, level);
+  return result;
+}
+// Sparse forms, compiled only when __NV_TEX_SPARSE is non-zero: "__itex2DLod_sparse" also receives the address of a status byte; *isResident = (status != 0).
+#if __NV_TEX_SPARSE
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float level, bool *isResident)
+{
+  unsigned char status;
+  __nv_tex_surf_handler("__itex2DLod_sparse", ptr, obj, x, y, level, &status);
+  *isResident = status != 0;
+}
+// Sparse by-value form: returns the local T filled by the sparse pointer form, which also sets *isResident.
+template <typename T>
+static __device__ T tex2DLod(cudaTextureObject_t texObject, float x, float y, float level, bool *isResident)
+{
+  T result;
+  tex2DLod(&result, texObject, x, y, level, isResident);
+  return result;
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+// tex3DLod, pointer form: forwards (ptr, obj, x, y, z, level) to __nv_tex_surf_handler as "__itex3DLod"; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex3DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level)
+{
+  __nv_tex_surf_handler("__itex3DLod", ptr, obj, x, y, z, level);
+}
+// tex3DLod, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T tex3DLod(cudaTextureObject_t texObject, float x, float y, float z, float level)
+{
+  T result;
+  tex3DLod(&result, texObject, x, y, z, level);
+  return result;
+}
+// Sparse forms, compiled only when __NV_TEX_SPARSE is non-zero: "__itex3DLod_sparse" also receives the address of a status byte; *isResident = (status != 0).
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex3DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level, bool *isResident)
+{
+  unsigned char status;
+  __nv_tex_surf_handler("__itex3DLod_sparse", ptr, obj, x, y, z, level, &status);
+  *isResident = status != 0;
+}
+// Sparse by-value form: returns the local T filled by the sparse pointer form, which also sets *isResident.
+template <typename T>
+static __device__ T tex3DLod(cudaTextureObject_t texObject, float x, float y, float z, float level, bool *isResident)
+{
+  T result;
+  tex3DLod(&result, texObject, x, y, z, level, isResident);
+  return result;
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+// tex1DLayeredLod, pointer form: forwards (ptr, obj, x, layer, level) to __nv_tex_surf_handler as "__itex1DLayeredLod"; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex1DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, int layer, float level)
+{
+  __nv_tex_surf_handler("__itex1DLayeredLod", ptr, obj, x, layer, level);
+}
+// tex1DLayeredLod, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T tex1DLayeredLod(cudaTextureObject_t texObject, float x, int layer, float level)
+{
+  T result;
+  tex1DLayeredLod(&result, texObject, x, layer, level);
+  return result;
+}
+
+// tex2DLayeredLod, pointer form: forwards (ptr, obj, x, y, layer, level) to __nv_tex_surf_handler as "__itex2DLayeredLod"; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float level)
+{
+  __nv_tex_surf_handler("__itex2DLayeredLod", ptr, obj, x, y, layer, level);
+}
+// tex2DLayeredLod, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T tex2DLayeredLod(cudaTextureObject_t texObject, float x, float y, int layer, float level)
+{
+  T result;
+  tex2DLayeredLod(&result, texObject, x, y, layer, level);
+  return result;
+}
+// Sparse forms, compiled only when __NV_TEX_SPARSE is non-zero: "__itex2DLayeredLod_sparse" also receives the address of a status byte; *isResident = (status != 0).
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float level, bool *isResident)
+{
+  unsigned char status;
+  __nv_tex_surf_handler("__itex2DLayeredLod_sparse", ptr, obj, x, y, layer, level, &status);
+  *isResident = status != 0;
+}
+// Sparse by-value form: returns the local T filled by the sparse pointer form, which also sets *isResident.
+template <typename T>
+static __device__ T tex2DLayeredLod(cudaTextureObject_t texObject, float x, float y, int layer, float level, bool *isResident)
+{
+  T result;
+  tex2DLayeredLod(&result, texObject, x, y, layer, level, isResident);
+  return result;
+}
+#endif  /* __NV_TEX_SPARSE */
+// texCubemapLod, pointer form: forwards (ptr, obj, x, y, z, level) to __nv_tex_surf_handler as "__itexCubemapLod"; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type texCubemapLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level)
+{
+  __nv_tex_surf_handler("__itexCubemapLod", ptr, obj, x, y, z, level);
+}
+// texCubemapLod, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T texCubemapLod(cudaTextureObject_t texObject, float x, float y, float z, float level)
+{
+  T result;
+  texCubemapLod(&result, texObject, x, y, z, level);
+  return result;
+}
+
+// texCubemapGrad, pointer form: calls __nv_tex_surf_handler as "__itexCubemapGrad_v2", passing dPdx and dPdy by address; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type texCubemapGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+  __nv_tex_surf_handler("__itexCubemapGrad_v2", ptr, obj, x, y, z, &dPdx, &dPdy);
+}
+// texCubemapGrad, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T texCubemapGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+  T result;
+  texCubemapGrad(&result, texObject, x, y, z, dPdx, dPdy);
+  return result;
+}
+// texCubemapLayeredLod, pointer form: forwards (ptr, obj, x, y, z, layer, level) to __nv_tex_surf_handler as "__itexCubemapLayeredLod"; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type texCubemapLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer, float level)
+{
+  __nv_tex_surf_handler("__itexCubemapLayeredLod", ptr, obj, x, y, z, layer, level);
+}
+// texCubemapLayeredLod, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T texCubemapLayeredLod(cudaTextureObject_t texObject, float x, float y, float z, int layer, float level)
+{
+  T result;
+  texCubemapLayeredLod(&result, texObject, x, y, z, layer, level);
+  return result;
+}
+// tex1DGrad, pointer form: forwards (ptr, obj, x, dPdx, dPdy) to __nv_tex_surf_handler as "__itex1DGrad", gradients by value; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex1DGrad(T *ptr, cudaTextureObject_t obj, float x, float dPdx, float dPdy)
+{
+  __nv_tex_surf_handler("__itex1DGrad", ptr, obj, x, dPdx, dPdy);
+}
+// tex1DGrad, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T tex1DGrad(cudaTextureObject_t texObject, float x, float dPdx, float dPdy)
+{
+  T result;
+  tex1DGrad(&result, texObject, x, dPdx, dPdy);
+  return result;
+}
+
+// tex2DGrad, pointer form: calls __nv_tex_surf_handler as "__itex2DGrad_v2", passing dPdx and dPdy by address; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float2 dPdx, float2 dPdy)
+{
+  __nv_tex_surf_handler("__itex2DGrad_v2", ptr, obj, x, y, &dPdx, &dPdy);
+}
+// tex2DGrad, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T tex2DGrad(cudaTextureObject_t texObject, float x, float y, float2 dPdx, float2 dPdy)
+{
+  T result;
+  tex2DGrad(&result, texObject, x, y, dPdx, dPdy);
+  return result;
+}
+// Sparse forms, compiled only when __NV_TEX_SPARSE is non-zero: "__itex2DGrad_sparse" also receives the address of a status byte; *isResident = (status != 0).
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float2 dPdx, float2 dPdy, bool *isResident)
+{
+  unsigned char status;
+  __nv_tex_surf_handler("__itex2DGrad_sparse", ptr, obj, x, y, &dPdx, &dPdy, &status);
+  *isResident = status != 0;
+}
+// Sparse by-value form: returns the local T filled by the sparse pointer form, which also sets *isResident.
+template <typename T>
+static __device__ T tex2DGrad(cudaTextureObject_t texObject, float x, float y, float2 dPdx, float2 dPdy, bool *isResident)
+{
+  T result;
+  tex2DGrad(&result, texObject, x, y, dPdx, dPdy, isResident);
+  return result;
+}
+#endif  /* __NV_TEX_SPARSE */
+
+// tex3DGrad, pointer form: calls __nv_tex_surf_handler as "__itex3DGrad_v2", passing dPdx and dPdy by address; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex3DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+  __nv_tex_surf_handler("__itex3DGrad_v2", ptr, obj, x, y, z, &dPdx, &dPdy);
+}
+// tex3DGrad, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T tex3DGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+  T result;
+  tex3DGrad(&result, texObject, x, y, z, dPdx, dPdy);
+  return result;
+}
+// Sparse forms, compiled only when __NV_TEX_SPARSE is non-zero: "__itex3DGrad_sparse" also receives the address of a status byte; *isResident = (status != 0).
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex3DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy, bool *isResident)
+{
+  unsigned char status;
+  __nv_tex_surf_handler("__itex3DGrad_sparse", ptr, obj, x, y, z, &dPdx, &dPdy, &status);
+  *isResident = status != 0;
+}
+// Sparse by-value form: returns the local T filled by the sparse pointer form, which also sets *isResident.
+template <typename T>
+static __device__ T tex3DGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy, bool *isResident)
+{
+  T result;
+  tex3DGrad(&result, texObject, x, y, z, dPdx, dPdy, isResident);
+  return result;
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+// tex1DLayeredGrad, pointer form: forwards (ptr, obj, x, layer, dPdx, dPdy) to __nv_tex_surf_handler as "__itex1DLayeredGrad", gradients by value; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex1DLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, int layer, float dPdx, float dPdy)
+{
+  __nv_tex_surf_handler("__itex1DLayeredGrad", ptr, obj, x, layer, dPdx, dPdy);
+}
+// tex1DLayeredGrad, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T tex1DLayeredGrad(cudaTextureObject_t texObject, float x, int layer, float dPdx, float dPdy)
+{
+  T result;
+  tex1DLayeredGrad(&result, texObject, x, layer, dPdx, dPdy);
+  return result;
+}
+
+// tex2DLayeredGrad, pointer form: calls __nv_tex_surf_handler as "__itex2DLayeredGrad_v2", passing dPdx and dPdy by address; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{
+  __nv_tex_surf_handler("__itex2DLayeredGrad_v2", ptr, obj, x, y, layer, &dPdx, &dPdy);
+}
+// tex2DLayeredGrad, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T tex2DLayeredGrad(cudaTextureObject_t texObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{
+  T result;
+  tex2DLayeredGrad(&result, texObject, x, y, layer, dPdx, dPdy);
+  return result;
+}
+// Sparse forms, compiled only when __NV_TEX_SPARSE is non-zero: "__itex2DLayeredGrad_sparse" also receives the address of a status byte; *isResident = (status != 0).
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float2 dPdx, float2 dPdy, bool *isResident)
+{
+  unsigned char status;
+  __nv_tex_surf_handler("__itex2DLayeredGrad_sparse", ptr, obj, x, y, layer, &dPdx, &dPdy, &status);
+  *isResident = status != 0;
+}
+// Sparse by-value form: returns the local T filled by the sparse pointer form, which also sets *isResident.
+template <typename T>
+static __device__ T tex2DLayeredGrad(cudaTextureObject_t texObject, float x, float y, int layer, float2 dPdx, float2 dPdy, bool *isResident)
+{
+  T result;
+  tex2DLayeredGrad(&result, texObject, x, y, layer, dPdx, dPdy, isResident);
+  return result;
+}
+#endif  /* __NV_TEX_SPARSE */
+
+// texCubemapLayeredGrad, pointer form: calls __nv_tex_surf_handler as "__itexCubemapLayeredGrad_v2", passing dPdx and dPdy by address; usable only for T that has an __nv_itex_trait<T>::type.
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type texCubemapLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
+{
+  __nv_tex_surf_handler("__itexCubemapLayeredGrad_v2", ptr, obj, x, y, z, layer, &dPdx, &dPdy);
+}
+// texCubemapLayeredGrad, by-value form: fills a local T through the pointer form and returns it.
+template <typename T>
+static __device__ T texCubemapLayeredGrad(cudaTextureObject_t texObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
+{
+  T result;
+  texCubemapLayeredGrad(&result, texObject, x, y, z, layer, dPdx, dPdy);
+  return result;
+}
+
+#undef __NV_TEX_SPARSE
+
+#endif // __cplusplus && __CUDACC__
+#endif // __TEXTURE_INDIRECT_FUNCTIONS_H__