Add files using upload-large-folder tool
Browse files. This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
See raw diff
- .venv/lib/python3.11/site-packages/triton/backends/amd/compiler.py +262 -0
- .venv/lib/python3.11/site-packages/triton/backends/amd/driver.c +211 -0
- .venv/lib/python3.11/site-packages/triton/backends/amd/driver.py +497 -0
- .venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/hip_runtime.h +75 -0
- .venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/texture_types.h +194 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/compiler.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/driver.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openacc/cupti_openacc.h +98 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/cupti_openmp.h +100 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/omp-tools.h +1083 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/channel_descriptor.h +588 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups.h +1730 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/async.h +452 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/coalesced_reduce.h +95 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/functional.h +212 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/helpers.h +693 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/memory.h +135 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/reduce.h +419 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/scan.h +320 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/memcpy_async.h +62 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/reduce.h +63 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/scan.h +63 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/common_functions.h +310 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/cudacc_ext.h +64 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_double_functions.h +1192 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_double_functions.hpp +197 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_functions.h +0 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_functions.hpp +1197 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/func_macro.h +57 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_config.h +310 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_defines.h +280 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_runtime.h +306 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/math_functions.h +0 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/math_functions.hpp +0 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/mma.h +754 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/mma.hpp +1128 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/nvfunctional +621 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.h +139 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.hpp +192 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.h +164 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.hpp +148 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.h +282 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.hpp +248 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/storage_class.h +142 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaGL.h +608 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaGLTypedefs.h +123 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaTypedefs.h +0 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_gl_interop.h +514 -0
- .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_runtime_api.h +0 -0
.venv/lib/python3.11/site-packages/triton/backends/amd/compiler.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from triton.backends.compiler import BaseBackend, GPUTarget
|
| 2 |
+
from triton._C.libtriton import ir, passes, llvm, amd
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from typing import Any, Tuple
|
| 5 |
+
import hashlib
|
| 6 |
+
import tempfile
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
import subprocess
|
| 10 |
+
import functools
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass(frozen=True)
class HIPOptions:
    """Compilation options for the HIP (AMD GPU) Triton backend.

    Frozen dataclass; derived fields (``warp_size``, normalized
    ``extern_libs``) are installed in ``__post_init__`` via
    ``object.__setattr__`` to work around the frozen restriction.
    """
    num_warps: int = 4
    waves_per_eu: int = 1
    # 0 selects the default software-pipelining behavior (see make_ttgir).
    num_stages: int = 0
    num_ctas: int = 1
    # Mapping of library name -> bitcode path; None means "defaults only".
    extern_libs: dict = None
    cluster_dims: tuple = (1, 1, 1)
    debug: bool = False
    # gfx* architecture string (e.g. 'gfx90a'); must be set by the backend.
    arch: str = None
    allow_fp8e4nv: bool = False
    allow_fp8e4b15: bool = False
    default_dot_input_precision: str = "ieee"
    allowed_dot_input_precisions: Tuple[str, ...] = ("ieee", )
    enable_fp_fusion: bool = True
    matrix_instr_nonkdim: int = 0
    kpack: int = 1
    allow_flush_denorm: bool = False
    max_num_imprecise_acc_default: int = 0
    backend_name: str = 'hip'

    def __post_init__(self):
        # Device bitcode libraries (ocml/ockl) ship next to this file.
        default_libdir = Path(__file__).parent / 'lib'
        extern_libs = {} if self.extern_libs is None else dict(self.extern_libs)
        # Ignore user-defined warp size for gfx9
        # (wave32 only exists on RDNA, i.e. gfx10/gfx11; everything else is wave64).
        warp_size = 32 if 'gfx10' in self.arch or 'gfx11' in self.arch else 64
        object.__setattr__(self, 'warp_size', warp_size)
        # Always link the ROCm device-library bitcode, overriding any
        # user-supplied entries of the same name.
        libs = ["ocml", "ockl"]
        for lib in libs:
            extern_libs[lib] = str(default_libdir / f'{lib}.bc')
        # Stored as a tuple of items so the frozen dataclass stays hashable.
        object.__setattr__(self, 'extern_libs', tuple(extern_libs.items()))
        assert self.num_warps > 0 and (self.num_warps & (self.num_warps - 1)) == 0, \
            "num_warps must be a power of 2"

    def hash(self):
        """Stable digest of all option values, used as a compilation cache key."""
        key = '_'.join([f'{name}-{val}' for name, val in self.__dict__.items()])
        return hashlib.sha256(key.encode("utf-8")).hexdigest()
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class HIPBackend(BaseBackend):
    """Triton compilation backend for AMD GPUs via HIP.

    Lowers Triton IR -> TritonGPU IR -> LLVM IR -> AMDGCN assembly ->
    linked HSA code object (``hsaco``). Each ``make_*`` staticmethod is one
    stage of that pipeline; ``add_stages`` wires them together.
    """

    @staticmethod
    def supports_target(target: GPUTarget):
        # This backend only handles HIP (AMD) targets.
        return target.backend == 'hip'

    def __init__(self, target: GPUTarget) -> None:
        super().__init__(target)
        # arch must be a gfx* architecture string (e.g. 'gfx90a').
        assert isinstance(target.arch, str)
        # Final compiled artifact is an HSA code object.
        self.binary_ext = "hsaco"

    def parse_options(self, opts) -> Any:
        """Build HIPOptions from a user dict, keeping only recognized keys."""
        args = {'arch': self.target.arch}
        args.update({k: opts[k] for k in HIPOptions.__dataclass_fields__.keys() if k in opts})
        return HIPOptions(**args)

    def pack_metadata(self, metadata):
        # Order matters: the launcher unpacks these positionally.
        return (
            metadata.num_warps,
            metadata.num_ctas,
            metadata.shared,
            metadata.cluster_dims[0],
            metadata.cluster_dims[1],
            metadata.cluster_dims[2],
        )

    def get_codegen_implementation(self):
        # No backend-specific codegen overrides are provided for HIP.
        codegen_fns = dict()
        return codegen_fns

    def load_dialects(self, ctx):
        # Register the AMD-specific MLIR dialects on this context.
        amd.load_dialects(ctx)

    @staticmethod
    def path_to_rocm_lld():
        """Locate the ROCm ``ld.lld`` linker.

        Search order: TRITON_HIP_LLD_PATH env var, a copy bundled next to
        this backend (used by pytorch wheels), the system ROCm install,
        then a plain /usr/bin fallback. Raises if none is found.
        """
        # Check env path for ld.lld
        lld_env_path = os.getenv("TRITON_HIP_LLD_PATH")
        if lld_env_path is not None:
            lld = Path(lld_env_path)
            if lld.is_file():
                return lld
        # Check backend for ld.lld (used for pytorch wheels)
        lld = Path(__file__).parent / "llvm/bin/ld.lld"
        if lld.is_file():
            return lld
        lld = Path("/opt/rocm/llvm/bin/ld.lld")
        if lld.is_file():
            return lld
        lld = Path("/usr/bin/ld.lld")
        if lld.is_file():
            return lld
        raise Exception("ROCm linker /opt/rocm/llvm/bin/ld.lld not found")

    @staticmethod
    def make_ttir(mod, metadata, options):
        """Canonicalize and optimize Triton IR (frontend-level passes)."""
        pm = ir.pass_manager(mod.context)
        pm.enable_debug()
        passes.common.add_inliner(pm)
        passes.ttir.add_rewrite_tensor_pointer(pm)
        passes.ttir.add_combine(pm)
        passes.common.add_canonicalizer(pm)
        passes.ttir.add_reorder_broadcast(pm)
        passes.common.add_cse(pm)
        passes.common.add_licm(pm)
        passes.common.add_symbol_dce(pm)
        pm.run(mod)
        return mod

    @staticmethod
    def make_ttgir(mod, metadata, options):
        """Convert Triton IR to TritonGPU IR and run GPU-level optimizations.

        NOTE: pass ordering below is deliberate (e.g. layout-conversion
        removal runs multiple times, after each pass that may introduce new
        conversions); do not reorder casually.
        """
        pm = ir.pass_manager(mod.context)
        pm.enable_debug()
        passes.ttir.add_convert_to_ttgpuir(pm, f"hip:{options.arch}", options.num_warps, options.warp_size,
                                           options.num_ctas)
        pm.run(mod)
        pm = ir.pass_manager(mod.context)
        pm.enable_debug()
        passes.ttgpuir.add_coalesce(pm)
        passes.ttgpuir.add_remove_layout_conversions(pm)
        passes.ttgpuir.add_optimize_thread_locality(pm)
        amd.passes.ttgpuir.add_accelerate_matmul(pm, options.arch, options.matrix_instr_nonkdim, options.kpack)
        passes.ttgpuir.add_remove_layout_conversions(pm)
        amd.passes.ttgpuir.add_optimize_epilogue(pm)
        passes.ttgpuir.add_optimize_dot_operands(pm, True)
        # Stream pipelining is only applied with the default num_stages (0)
        # and only on architectures with matrix-core (MFMA) support.
        if options.num_stages == 0 and amd.has_matrix_core_feature(options.arch):
            amd.passes.ttgpuir.add_stream_pipeline(pm)
            passes.common.add_canonicalizer(pm)
        passes.ttgpuir.add_optimize_dot_operands(pm, True)
        passes.ttgpuir.add_remove_layout_conversions(pm)
        passes.ttgpuir.add_reduce_data_duplication(pm)
        if options.num_stages != 0:
            amd.passes.ttgpuir.add_reorder_instructions(pm)
        passes.common.add_cse(pm)
        passes.common.add_symbol_dce(pm)
        pm.run(mod)
        return mod

    @staticmethod
    def make_llir(src, metadata, options):
        """Lower TritonGPU IR to textual LLVM IR for the AMDGPU target.

        Also records shared-memory usage in ``metadata`` and stamps kernel
        attributes / OCLC control constants needed by the device libraries.
        """
        mod = src
        # TritonGPU -> LLVM-IR (MLIR)
        pm = ir.pass_manager(mod.context)
        pm.enable_debug()
        amd.passes.ttgpuir.add_decompose_unsupported_conversions(pm, options.arch)
        passes.convert.add_scf_to_cf(pm)
        passes.convert.add_index_to_llvmir(pm)

        passes.ttgpuir.add_allocate_shared_memory(pm)
        ## __HIP_FTZ is used to control the denorm flushing behavior of exp2 op as follows:
        ## 1. If __HIP_FTZ = 1, exp2 flushes denorms in input and output regardless
        ##    of the value of kernel arg `allow_flush_denorm`.
        ## 2. If __HIP_FTZ = 0, whether exp2 flushes denorms in input and output
        ##    depends on the value of kernel arg `allow_flush_denorm`.
        ## 3. __HIP_FTZ is default to 1 and not exposed as a kernel argument.
        ##    For now it is used as a controller for developers only.
        __HIP_FTZ = True
        amd.passes.ttgpuir.add_to_llvmir(pm, options.arch, __HIP_FTZ)
        passes.common.add_canonicalizer(pm)
        passes.common.add_cse(pm)

        passes.convert.add_cf_to_llvmir(pm)
        passes.convert.add_arith_to_llvmir(pm)
        passes.common.add_canonicalizer(pm)
        passes.common.add_cse(pm)
        passes.common.add_symbol_dce(pm)
        # Debug line info is emitted unless explicitly disabled via env var.
        if os.environ.get("TRITON_DISABLE_LINE_INFO", "0") == "0":
            passes.llvmir.add_di_scope(pm)
        # This pass (`add_builtin_func_to_llvmir`) serves as a temporary workaround to address the issue of excessive basic block
        # count caused by predicated loads/stores. In certain kernels, the addition of these blocks can cause the MLIR
        # canonicalizer to never finish when attempting to merge blocks. The permanent solution under consideration
        # involves using MUBUF instructions that have built-in out-of-bounds checks, which would eliminate the need
        # for conditional branching around memory accesses.
        amd.passes.ttgpuir.add_builtin_func_to_llvmir(pm)
        pm.run(mod)

        # LLVM-IR (MLIR) -> LLVM-IR (LLVM)
        llvm.init_targets()
        context = llvm.context()
        llvm_mod = llvm.to_module(mod, context)

        # Set various control constants on the LLVM module so that device
        # libraries can resolve references to them.
        amd.set_isa_version(llvm_mod, options.arch)
        amd.set_abi_version(llvm_mod, 400)
        amd.set_bool_control_constant(llvm_mod, "__oclc_finite_only_opt", False)
        amd.set_bool_control_constant(llvm_mod, "__oclc_correctly_rounded_sqrt32", True)
        amd.set_bool_control_constant(llvm_mod, "__oclc_unsafe_math_opt", False)
        amd.set_bool_control_constant(llvm_mod, "__oclc_wavefrontsize64", options.warp_size == 64)

        # Set kernel attributes first given this may affect later optimizations.
        fns = [fn for fn in llvm_mod.get_functions() if not fn.is_declaration()]
        # The public kernel should be kernel 0.
        # NOTE(review): assumes at least one defined function exists in the
        # module at this point — appears to hold for modules produced above.
        fns[0].set_calling_conv(amd.CALLING_CONV_AMDGPU_KERNEL)
        fns[0].add_fn_attr("amdgpu-flat-work-group-size", f"1,{options.num_warps*options.warp_size}")
        fns[0].add_fn_attr("amdgpu-waves-per-eu", f"{options.waves_per_eu}")
        denormal_mode = "preserve-sign" if options.allow_flush_denorm else "ieee"
        fns[0].add_fn_attr("denormal-fp-math-f32", denormal_mode)

        # Only link device bitcode libraries that are actually referenced.
        if options.extern_libs:
            paths = [path for (name, path) in options.extern_libs if amd.need_extern_lib(llvm_mod, name)]
            llvm.link_extern_libs(llvm_mod, paths)

        llvm.optimize_module(llvm_mod, llvm.OPTIMIZE_O3, amd.TARGET_TRIPLE)

        # Get some metadata
        metadata["shared"] = src.get_int_attr("triton_gpu.shared")

        amd.cleanup_bitcode_metadata(llvm_mod)
        return str(llvm_mod)

    @staticmethod
    def make_amdgcn(src, metadata, options):
        """Translate textual LLVM IR to AMDGCN assembly; records kernel name."""
        # Find kernel names (there should only be one)
        # We get the name at the last possible step to accommodate `triton.compile`
        # on user-provided LLVM
        names = re.findall(r"define amdgpu_kernel void @([a-zA-Z_][a-zA-Z0-9_]*)", src)
        assert len(names) == 1
        metadata["name"] = names[0]
        # llvm -> hsaco
        amdgcn = llvm.translate_to_asm(src, amd.TARGET_TRIPLE, options.arch, '', [], options.enable_fp_fusion, False)
        # Optional developer dump of the generated assembly.
        if os.environ.get("AMDGCN_ENABLE_DUMP", "0") == "1":
            print("// -----// AMDGCN Dump //----- //")
            print(amdgcn)
        return amdgcn

    @staticmethod
    def make_hsaco(src, metadata, options):
        """Assemble AMDGCN and link it with ld.lld into an HSA code object."""
        hsaco = amd.assemble_amdgcn(src, options.arch, '')

        rocm_path = HIPBackend.path_to_rocm_lld()
        # Link via temp files since ld.lld works on files, not byte streams.
        with tempfile.NamedTemporaryFile() as tmp_out:
            with tempfile.NamedTemporaryFile() as tmp_in:
                with open(tmp_in.name, 'wb') as fd_in:
                    fd_in.write(hsaco)
                subprocess.check_call([rocm_path, '-flavor', 'gnu', '-shared', tmp_in.name, '-o', tmp_out.name])
            with open(tmp_out.name, 'rb') as fd_out:
                ret = fd_out.read()
        return ret

    def add_stages(self, stages, options):
        # Pipeline: ttir -> ttgir -> llir -> amdgcn -> hsaco.
        stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options)
        stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options)
        stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options)
        stages["amdgcn"] = lambda src, metadata: self.make_amdgcn(src, metadata, options)
        stages["hsaco"] = lambda src, metadata: self.make_hsaco(src, metadata, options)

    @functools.lru_cache()
    def hash(self):
        """Cache key for this backend: linker version plus target description."""
        version = subprocess.check_output([HIPBackend.path_to_rocm_lld(), "--version"], encoding='utf-8')
        return f'{version}-{self.target}'
|
.venv/lib/python3.11/site-packages/triton/backends/amd/driver.c
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Preamble for the HIP driver utility extension: header includes, the HIP
// runtime library search-path placeholder, and the dynamically resolved
// symbol table used throughout this file.
#define __HIP_PLATFORM_AMD__
// clang-format off
// hip_deprecated.h needs definitions from hip_runtime.h.
#include <hip/hip_runtime.h>
#include <hip/hip_deprecated.h>
// clang-format on
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>

// The list of paths to search for the HIP runtime library. The caller Python
// code should substitute the search path placeholder.
static const char *hipLibSearchPaths[] = {"/*py_libhip_search_path*/"};

// The list of HIP dynamic library symbols and their signature we are interested
// in this file.
// |FOR_EACH_ERR_FN| is a macro to process APIs that return hipError_t;
// |FOR_EACH_STR_FN| is a macro to process APIs that return const char *.
//
// HIP 6.0 introduced an updated hipGetDeviceProperties API under a new symbol,
// hipGetDevicePropertiesR0600. However, the associated hipDeviceProp_t was
// directly updated with breaking changes to match hipGetDevicePropertiesR0600
// in the header file. We include the header file from HIP 6.0. So here if we
// use hipGetDeviceProperties together with hipDeviceProp_t we will use the
// old API with a new struct definition and mess up the interpretation.
//
// This is a known issue: https://github.com/ROCm/ROCm/issues/2728.
//
// For now explicitly defer to the old hipDeviceProp_t struct. This should work
// for both 5.x and 6.x. In the long term we need to switch to use
// hipGetProcAddress once available:
// https://github.com/ROCm/clr/commit/0479cdb3dd30ef58718cad44e424bd793c394cc0
#define HIP_SYMBOL_LIST(FOR_EACH_ERR_FN, FOR_EACH_STR_FN)                      \
  FOR_EACH_STR_FN(hipGetErrorString, hipError_t hipError)                      \
  FOR_EACH_ERR_FN(hipGetDeviceProperties, hipDeviceProp_tR0000 *prop,          \
                  int deviceId)                                                \
  FOR_EACH_ERR_FN(hipModuleLoadDataEx, hipModule_t *module, const void *image, \
                  unsigned int numOptions, hipJitOption *options,              \
                  void **optionValues)                                         \
  FOR_EACH_ERR_FN(hipModuleGetFunction, hipFunction_t *function,               \
                  hipModule_t module, const char *kname)                       \
  FOR_EACH_ERR_FN(hipFuncGetAttribute, int *, hipFunction_attribute attr,      \
                  hipFunction_t function)

// The HIP symbol table for holding resolved dynamic library symbols.
struct HIPSymbolTable {
#define DEFINE_EACH_ERR_FIELD(hipSymbolName, ...)                              \
  hipError_t (*hipSymbolName)(__VA_ARGS__);
#define DEFINE_EACH_STR_FIELD(hipSymbolName, ...)                              \
  const char *(*hipSymbolName)(__VA_ARGS__);

  HIP_SYMBOL_LIST(DEFINE_EACH_ERR_FIELD, DEFINE_EACH_STR_FIELD)
};

static struct HIPSymbolTable hipSymbolTable;
|
| 58 |
+
|
| 59 |
+
bool initSymbolTable() {
|
| 60 |
+
// Use the HIP runtime library loaded into the existing process if it exits.
|
| 61 |
+
void *lib = dlopen("libamdhip64.so", RTLD_NOLOAD);
|
| 62 |
+
if (lib) {
|
| 63 |
+
// printf("[triton] chosen loaded libamdhip64.so in the process\n");
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
// Otherwise, go through the list of search paths to dlopen the first HIP
|
| 67 |
+
// driver library.
|
| 68 |
+
if (!lib) {
|
| 69 |
+
int n = sizeof(hipLibSearchPaths) / sizeof(hipLibSearchPaths[0]);
|
| 70 |
+
for (int i = 0; i < n; ++i) {
|
| 71 |
+
void *handle = dlopen(hipLibSearchPaths[i], RTLD_LAZY | RTLD_LOCAL);
|
| 72 |
+
if (handle) {
|
| 73 |
+
lib = handle;
|
| 74 |
+
// printf("[triton] chosen %s\n", hipLibSearchPaths[i]);
|
| 75 |
+
}
|
| 76 |
+
}
|
| 77 |
+
}
|
| 78 |
+
if (!lib) {
|
| 79 |
+
PyErr_SetString(PyExc_RuntimeError, "cannot open libamdhip64.so");
|
| 80 |
+
return false;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
// Resolve all symbols we are interested in.
|
| 84 |
+
dlerror(); // Clear existing errors
|
| 85 |
+
const char *error = NULL;
|
| 86 |
+
#define QUERY_EACH_FN(hipSymbolName, ...) \
|
| 87 |
+
*(void **)&hipSymbolTable.hipSymbolName = dlsym(lib, #hipSymbolName); \
|
| 88 |
+
error = dlerror(); \
|
| 89 |
+
if (error) { \
|
| 90 |
+
PyErr_SetString(PyExc_RuntimeError, \
|
| 91 |
+
"cannot query " #hipSymbolName " from libamdhip64.so"); \
|
| 92 |
+
dlclose(lib); \
|
| 93 |
+
return false; \
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
HIP_SYMBOL_LIST(QUERY_EACH_FN, QUERY_EACH_FN)
|
| 97 |
+
|
| 98 |
+
return true;
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
// Record a Python RuntimeError (acquiring the GIL first) if |code| is not a
// HIP success code. Does not raise by itself: callers must check
// PyErr_Occurred(), which the HIP_CHECK macro below does. |file| and |line|
// are accepted for interface stability but are not currently included in the
// message.
static inline void gpuAssert(hipError_t code, const char *file, int line) {
  if (code != HIP_SUCCESS) {
    const char *prefix = "Triton Error [HIP]: ";
    const char *str = hipSymbolTable.hipGetErrorString(code);
    char err[1024] = {0};
    // Fixed typo in the user-visible message: "Messsage" -> "Message".
    snprintf(err, 1024, "%s Code: %d, Message: %s", prefix, code, str);
    PyGILState_STATE gil_state;
    gil_state = PyGILState_Ensure();
    PyErr_SetString(PyExc_RuntimeError, err);
    PyGILState_Release(gil_state);
  }
}

// Evaluate a HIP call; if it failed, a Python exception has been set, so
// return NULL from the enclosing CPython binding to propagate it.
#define HIP_CHECK(ans)                                                         \
  {                                                                            \
    gpuAssert((ans), __FILE__, __LINE__);                                      \
    if (PyErr_Occurred())                                                      \
      return NULL;                                                             \
  }
|
| 124 |
+
|
| 125 |
+
// Python binding: get_device_properties(device_id: int) -> dict.
// Queries hipGetDeviceProperties using the pre-6.0 hipDeviceProp_tR0000
// struct layout (see the HIP 6.0 compatibility note at the top of this file)
// and returns a dict of selected fields.
static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
  int device_id;
  if (!PyArg_ParseTuple(args, "i", &device_id))
    return NULL;

  hipDeviceProp_tR0000 props;
  HIP_CHECK(hipSymbolTable.hipGetDeviceProperties(&props, device_id));

  // create a struct to hold device properties
  return Py_BuildValue(
      "{s:i, s:i, s:i, s:i, s:i, s:i, s:s, s:i}", "max_shared_mem",
      props.sharedMemPerBlock, "max_num_regs", props.regsPerBlock,
      "multiprocessor_count", props.multiProcessorCount, "sm_clock_rate",
      props.clockRate, "mem_clock_rate", props.memoryClockRate, "mem_bus_width",
      props.memoryBusWidth, "arch", props.gcnArchName, "warpSize",
      props.warpSize);
}
|
| 142 |
+
|
| 143 |
+
// Python binding: load_binary(name, data, shared, device) -> (mod, fun,
// n_regs, n_spills).
// Loads an hsaco code object into the HIP driver via hipModuleLoadDataEx,
// looks up the kernel function |name| in it, and queries its register and
// spill usage. |shared| and |device| are parsed but not used here; |data|
// arrives as a Python bytes-like object (its length goes to data_size, which
// is likewise unused since the module image is self-describing).
static PyObject *loadBinary(PyObject *self, PyObject *args) {
  const char *name;
  const char *data;
  Py_ssize_t data_size;
  int shared;
  int device;
  if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared,
                        &device)) {
    return NULL;
  }

  // set HIP options
  // JIT log buffers: errors and info are captured into local stack buffers;
  // verbose logging is enabled via the final (void *)1 option value.
  hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes,
                        hipJitOptionErrorLogBuffer,
                        hipJitOptionInfoLogBufferSizeBytes,
                        hipJitOptionInfoLogBuffer, hipJitOptionLogVerbose};
  const unsigned int errbufsize = 8192;
  const unsigned int logbufsize = 8192;
  char _err[errbufsize];
  char _log[logbufsize];
  void *optval[] = {(void *)(uintptr_t)errbufsize, (void *)_err,
                    (void *)(uintptr_t)logbufsize, (void *)_log, (void *)1};

  // launch HIP Binary
  hipModule_t mod;
  hipFunction_t fun;
  HIP_CHECK(hipSymbolTable.hipModuleLoadDataEx(&mod, data, 5, opt, optval))
  HIP_CHECK(hipSymbolTable.hipModuleGetFunction(&fun, mod, name));

  // get allocated registers and spilled registers from the function
  int n_regs = 0;
  int n_spills = 0;
  hipSymbolTable.hipFuncGetAttribute(&n_regs, HIP_FUNC_ATTRIBUTE_NUM_REGS, fun);
  hipSymbolTable.hipFuncGetAttribute(&n_spills,
                                     HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun);
  // LOCAL_SIZE_BYTES is in bytes; report spills in 4-byte dwords.
  n_spills /= 4;
  if (PyErr_Occurred()) {
    return NULL;
  }
  // Module and function handles are returned to Python as raw uint64 values.
  return Py_BuildValue("(KKii)", (uint64_t)mod, (uint64_t)fun, n_regs,
                       n_spills);
}
|
| 185 |
+
|
| 186 |
+
static PyMethodDef ModuleMethods[] = {
|
| 187 |
+
{"load_binary", loadBinary, METH_VARARGS,
|
| 188 |
+
"Load provided hsaco into HIP driver"},
|
| 189 |
+
{"get_device_properties", getDeviceProperties, METH_VARARGS,
|
| 190 |
+
"Get the properties for a given device"},
|
| 191 |
+
{NULL, NULL, 0, NULL} // sentinel
|
| 192 |
+
};
|
| 193 |
+
|
| 194 |
+
static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "hip_utils",
|
| 195 |
+
NULL, // documentation
|
| 196 |
+
-1, // size
|
| 197 |
+
ModuleMethods};
|
| 198 |
+
|
| 199 |
+
PyMODINIT_FUNC PyInit_hip_utils(void) {
|
| 200 |
+
if (!initSymbolTable()) {
|
| 201 |
+
return NULL;
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
PyObject *m = PyModule_Create(&ModuleDef);
|
| 205 |
+
if (m == NULL) {
|
| 206 |
+
return NULL;
|
| 207 |
+
}
|
| 208 |
+
PyModule_AddFunctions(m, ModuleMethods);
|
| 209 |
+
|
| 210 |
+
return m;
|
| 211 |
+
}
|
.venv/lib/python3.11/site-packages/triton/backends/amd/driver.py
ADDED
|
@@ -0,0 +1,497 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import functools
|
| 2 |
+
import os
|
| 3 |
+
import hashlib
|
| 4 |
+
import subprocess
|
| 5 |
+
import tempfile
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from triton.runtime.build import _build
|
| 8 |
+
from triton.runtime.cache import get_cache_manager
|
| 9 |
+
from triton.backends.compiler import GPUTarget
|
| 10 |
+
from triton.backends.driver import GPUDriver
|
| 11 |
+
|
| 12 |
+
dirname = os.path.dirname(os.path.realpath(__file__))
|
| 13 |
+
include_dir = [os.path.join(dirname, "include")]
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _find_already_mmapped_dylib_on_linux(lib_name):
|
| 17 |
+
import platform
|
| 18 |
+
if platform.system() != 'Linux':
|
| 19 |
+
return None
|
| 20 |
+
|
| 21 |
+
# Use dl_iterate_phdr to walk through the list of shared libraries at runtime.
|
| 22 |
+
# See https://www.man7.org/linux/man-pages/man3/dl_iterate_phdr.3.html for details.
|
| 23 |
+
|
| 24 |
+
import ctypes
|
| 25 |
+
from ctypes import c_char, c_int, c_size_t, c_void_p, c_char_p, POINTER
|
| 26 |
+
|
| 27 |
+
class DlPhdrInfo(ctypes.Structure):
|
| 28 |
+
_fields_ = [
|
| 29 |
+
('dlpi_addr', c_void_p),
|
| 30 |
+
('dlpi_name', c_char_p),
|
| 31 |
+
# We don't care about the remaining fields.
|
| 32 |
+
]
|
| 33 |
+
|
| 34 |
+
# callback_t must use POINTER(c_char) to avoid copying.
|
| 35 |
+
callback_t = ctypes.CFUNCTYPE(c_int, POINTER(DlPhdrInfo), POINTER(c_size_t), POINTER(c_char))
|
| 36 |
+
|
| 37 |
+
# Load libc and get the dl_iterate_phdr symbol.
|
| 38 |
+
try:
|
| 39 |
+
dl_iterate_phdr = ctypes.CDLL('libc.so.6').dl_iterate_phdr
|
| 40 |
+
except:
|
| 41 |
+
return None
|
| 42 |
+
# argtypes must use c_char_p to accept create_string_buffer.
|
| 43 |
+
dl_iterate_phdr.argtypes = [callback_t, c_char_p]
|
| 44 |
+
dl_iterate_phdr.restype = c_int
|
| 45 |
+
|
| 46 |
+
max_path_length = 4096
|
| 47 |
+
path = ctypes.create_string_buffer(max_path_length + 1)
|
| 48 |
+
|
| 49 |
+
# Define callback to get the loaded dylib path.
|
| 50 |
+
def callback(info, size, data):
|
| 51 |
+
dlpi_name = info.contents.dlpi_name
|
| 52 |
+
p = Path(os.fsdecode(dlpi_name))
|
| 53 |
+
if lib_name in p.name:
|
| 54 |
+
# Found the dylib; get its path.
|
| 55 |
+
ctypes.memmove(data, dlpi_name, min(max_path_length, len(dlpi_name)))
|
| 56 |
+
return 1
|
| 57 |
+
return 0
|
| 58 |
+
|
| 59 |
+
if dl_iterate_phdr(callback_t(callback), path):
|
| 60 |
+
return os.fsdecode(ctypes.string_at(path))
|
| 61 |
+
return None
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@functools.lru_cache()
|
| 65 |
+
def _get_path_to_hip_runtime_dylib():
|
| 66 |
+
lib_name = "libamdhip64.so"
|
| 67 |
+
|
| 68 |
+
# If we are told explicitly what HIP runtime dynamic library to use, obey that.
|
| 69 |
+
env_libhip_path = os.getenv("TRITON_LIBHIP_PATH")
|
| 70 |
+
if env_libhip_path:
|
| 71 |
+
if env_libhip_path.endswith(lib_name) and os.path.exists(env_libhip_path):
|
| 72 |
+
return env_libhip_path
|
| 73 |
+
raise RuntimeError(f"TRITON_LIBHIP_PATH '{env_libhip_path}' does not point to a valid {lib_name}")
|
| 74 |
+
|
| 75 |
+
# If the shared object is already mmapped to address space, use it.
|
| 76 |
+
mmapped_path = _find_already_mmapped_dylib_on_linux(lib_name)
|
| 77 |
+
if mmapped_path:
|
| 78 |
+
if os.path.exists(mmapped_path):
|
| 79 |
+
return mmapped_path
|
| 80 |
+
raise RuntimeError(f"memory mapped '{mmapped_path}' in process does not point to a valid {lib_name}")
|
| 81 |
+
|
| 82 |
+
paths = []
|
| 83 |
+
|
| 84 |
+
import site
|
| 85 |
+
# First search the HIP runtime dynamic library packaged with PyTorch. It's very likely
|
| 86 |
+
# that we run Triton together with PyTorch. This makes sure we use the same dynamic
|
| 87 |
+
# library to avoid version mismatch.
|
| 88 |
+
site_packages = site.getsitepackages()
|
| 89 |
+
user_site = site.getusersitepackages()
|
| 90 |
+
if site.ENABLE_USER_SITE: # ENABLE_USER_SITE is initialized in getusersitepackages()
|
| 91 |
+
site_packages = [user_site] + site_packages
|
| 92 |
+
for path in site_packages:
|
| 93 |
+
path = os.path.join(path, "torch", "lib", lib_name)
|
| 94 |
+
if os.path.exists(path):
|
| 95 |
+
return path
|
| 96 |
+
paths.append(path)
|
| 97 |
+
|
| 98 |
+
# Then try to see if developer provides a HIP runtime dynamic library using LD_LIBARAY_PATH.
|
| 99 |
+
env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
|
| 100 |
+
if env_ld_library_path:
|
| 101 |
+
for d in env_ld_library_path.split(":"):
|
| 102 |
+
f = os.path.join(d, lib_name)
|
| 103 |
+
if os.path.exists(f):
|
| 104 |
+
return f
|
| 105 |
+
paths.append(f)
|
| 106 |
+
|
| 107 |
+
# Afterwards try to search the loader dynamic library resolution paths.
|
| 108 |
+
libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
|
| 109 |
+
# each line looks like the following:
|
| 110 |
+
# libamdhip64.so.6 (libc6,x86-64) => /opt/rocm-6.0.2/lib/libamdhip64.so.6
|
| 111 |
+
# libamdhip64.so (libc6,x86-64) => /opt/rocm-6.0.2/lib/libamdhip64.so
|
| 112 |
+
locs = [line.split()[-1] for line in libs.splitlines() if line.strip().endswith(lib_name)]
|
| 113 |
+
for loc in locs:
|
| 114 |
+
if os.path.exists(loc):
|
| 115 |
+
return loc
|
| 116 |
+
paths.append(loc)
|
| 117 |
+
|
| 118 |
+
# As a last resort, guess if we have it in some common installation path.
|
| 119 |
+
common_install_path = os.path.join('/opt/rocm/lib/', lib_name)
|
| 120 |
+
if os.path.exists(common_install_path):
|
| 121 |
+
return common_install_path
|
| 122 |
+
paths.append(common_install_path)
|
| 123 |
+
|
| 124 |
+
raise RuntimeError(f"cannot locate {lib_name} after attempted paths {paths}")
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def compile_module_from_src(src, name):
    """Compile C source into a shared object and import it as a Python module.

    The build artifact is cached under a key derived from the SHA-256 of the
    source text, so identical sources are compiled only once.
    """
    digest = hashlib.sha256(src.encode("utf-8")).hexdigest()
    manager = get_cache_manager(digest)
    so_name = f"{name}.so"
    so_path = manager.get_file(so_name)
    if so_path is None:
        # Cache miss: build in a throwaway directory, then publish the binary
        # into the cache and use the cached copy from here on.
        with tempfile.TemporaryDirectory() as workdir:
            c_file = os.path.join(workdir, "main.c")
            with open(c_file, "w") as out:
                out.write(src)
            built = _build(name, c_file, workdir, [], include_dir, [])
            with open(built, "rb") as binary:
                so_path = manager.put(binary.read(), so_name, binary=True)
    import importlib.util
    module_spec = importlib.util.spec_from_file_location(name, so_path)
    module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(module)
    return module
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class HIPUtils(object):
    """Singleton wrapper exposing the native hip_utils extension module."""

    def __new__(cls):
        # Classic single-instance pattern: create the instance once and hand
        # back the same object afterwards.  Note __init__ still runs on every
        # instantiation, matching the original behavior.
        if not hasattr(cls, "instance"):
            cls.instance = super(HIPUtils, cls).__new__(cls)
        return cls.instance

    def __init__(self):
        libhip_path = _get_path_to_hip_runtime_dylib()
        driver_src = Path(os.path.join(dirname, "driver.c")).read_text()
        # Just do a simple search and replace here instead of templates or format strings.
        # This way we don't need to escape-quote C code curly brackets and we can replace
        # exactly once.
        driver_src = driver_src.replace('/*py_libhip_search_path*/', libhip_path, 1)
        module = compile_module_from_src(driver_src, "hip_utils")
        self.load_binary = module.load_binary
        self.get_device_properties = module.get_device_properties
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
# -------------------- Launcher ----------------------------
|
| 166 |
+
def ty_to_cpp(ty):
    """Map a Triton type string to the C/C++ type used by the HIP launcher.

    Pointer types (leading ``*``) all become an opaque ``hipDeviceptr_t``.
    Scalar types map to fixed-width C types; note fp16/bf16 are passed to the
    launcher as plain ``float``.  Unknown types raise KeyError.
    """
    if ty[0] == '*':
        return "hipDeviceptr_t"
    scalar_cpp_types = {
        "i1": "int32_t",
        "i8": "int8_t",
        "i16": "int16_t",
        "i32": "int32_t",
        "i64": "int64_t",
        "u1": "uint32_t",
        "u8": "uint8_t",
        "u16": "uint16_t",
        "u32": "uint32_t",
        "u64": "uint64_t",
        "fp16": "float",
        "bf16": "float",
        "fp32": "float",
        "f32": "float",
        "fp64": "double",
    }
    return scalar_cpp_types[ty]
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def make_launcher(constants, signature, ids, warp_size):
    """Generate the C source of a Python extension that launches one kernel.

    The returned string is compiled by the caller into a module exposing a
    single ``launch(gridX, gridY, gridZ, stream, function, kernel_metadata,
    launch_metadata, enter_hook, exit_hook, *args)`` entry point that resolves
    HIP symbols via dlopen/dlsym and dispatches hipModuleLaunchKernel.

    constants: mapping of argument index -> constexpr value (excluded from the
        launch parameter array).
    signature: mapping of argument index -> Triton type string.
    ids: currently unused here (kept for interface compatibility) — TODO confirm.
    warp_size: threads per warp; blockDim.x is warp_size * num_warps.
    """
    # start_desc is computed but not used below — presumably a leftover from
    # the commented-out generate_cu_signature call; kept for compatibility.
    start_desc = len(signature)
    #signature = generate_cu_signature(constants, signature, ids)
    # C parameter declarations for every kernel argument, e.g. "int32_t arg0, ...".
    arg_decls = ', '.join(f"{ty_to_cpp(ty)} arg{i}" for i, ty in signature.items())

    def _extracted_type(ty):
        # C type used when extracting the argument from the Python tuple.
        # Pointer-typed args arrive as Python objects (ints or tensors).
        if ty[0] == '*':
            return "PyObject*"
        return {
            'i1': 'int32_t',
            'i8': 'int8_t',
            'i16': 'int16_t',
            'i32': 'int32_t',
            'i64': 'int64_t',
            'u1': 'uint32_t',
            'u8': 'uint8_t',
            'u16': 'uint16_t',
            'u32': 'uint32_t',
            'u64': 'uint64_t',
            'fp16': 'float',
            'bf16': 'float',
            'fp32': 'float',
            'f32': 'float',
            'fp64': 'double',
        }[ty]

    def format_of(ty):
        # PyArg_ParseTuple format character for a given C type.
        return {
            "PyObject*": "O",
            "float": "f",
            "double": "d",
            "long": "l",
            "int8_t": "b",
            "int16_t": "h",
            "int32_t": "i",
            "int64_t": "l",
            "uint8_t": "B",
            "uint16_t": "H",
            "uint32_t": "I",
            "uint64_t": "K",
        }[ty]

    # Full format string: grid dims (iii), stream+function handles (KK), four
    # metadata/hook objects (OOOO), then one character per kernel argument.
    args_format = ''.join([format_of(_extracted_type(ty)) for ty in signature.values()])
    format = "iiiKKOOOO" + args_format
    args_list = ', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''

    libhip_path = _get_path_to_hip_runtime_dylib()

    # generate glue code
    # Only non-constexpr arguments are forwarded to hipModuleLaunchKernel.
    params = [i for i in signature.keys() if i not in constants]
    src = f"""
#define __HIP_PLATFORM_AMD__
#include <hip/hip_runtime.h>
#include <Python.h>
#include <dlfcn.h>
#include <stdbool.h>
#include <dlfcn.h>

// The list of paths to search for the HIP runtime library. The caller Python
// code should substitute the search path placeholder.
static const char *hipLibSearchPaths[] = {{"{libhip_path}"}};

// The list of HIP dynamic library symbols and their signature we are interested
// in this file.
#define HIP_SYMBOL_LIST(FOR_EACH_ERR_FN, FOR_EACH_STR_FN) \\
  FOR_EACH_STR_FN(hipGetErrorString, hipError_t hipError) \\
  FOR_EACH_ERR_FN(hipModuleLaunchKernel, hipFunction_t f, \\
                  unsigned int gridDimX, unsigned int gridDimY, \\
                  unsigned int gridDimZ, unsigned int blockDimX, \\
                  unsigned int blockDimY, unsigned int blockDimZ, \\
                  unsigned int sharedMemBytes, hipStream_t stream, \\
                  void **kernelParams, void **extra) \\
  FOR_EACH_ERR_FN(hipPointerGetAttribute, void *data, \\
                  hipPointer_attribute attribute, hipDeviceptr_t ptr)

// The HIP symbol table for holding resolved dynamic library symbols.
struct HIPSymbolTable {{
#define DEFINE_EACH_ERR_FIELD(hipSymbolName, ...) \\
  hipError_t (*hipSymbolName)(__VA_ARGS__);
#define DEFINE_EACH_STR_FIELD(hipSymbolName, ...) \\
  const char *(*hipSymbolName)(__VA_ARGS__);

  HIP_SYMBOL_LIST(DEFINE_EACH_ERR_FIELD, DEFINE_EACH_STR_FIELD)
}};

static struct HIPSymbolTable hipSymbolTable;

bool initSymbolTable() {{
  // Use the HIP runtime library loaded into the existing process if it exits.
  void *lib = dlopen("libamdhip64.so", RTLD_NOLOAD);
  if (lib) {{
    // printf("[triton] chosen loaded libamdhip64.so in the process\\n");
  }}

  // Otherwise, go through the list of search paths to dlopen the first HIP
  // driver library.
  if (!lib) {{
    int n = sizeof(hipLibSearchPaths) / sizeof(hipLibSearchPaths[0]);
    for (int i = 0; i < n; ++i) {{
      void *handle = dlopen(hipLibSearchPaths[i], RTLD_LAZY | RTLD_LOCAL);
      if (handle) {{
        lib = handle;
        // printf("[triton] chosen %s\\n", hipLibSearchPaths[i]);
      }}
    }}
  }}
  if (!lib) {{
    PyErr_SetString(PyExc_RuntimeError, "cannot open libamdhip64.so");
    return false;
  }}

  // Resolve all symbols we are interested in.
  dlerror(); // Clear existing errors
  const char *error = NULL;
#define QUERY_EACH_FN(hipSymbolName, ...) \\
  *(void **)&hipSymbolTable.hipSymbolName = dlsym(lib, #hipSymbolName); \\
  error = dlerror(); \\
  if (error) {{ \\
    PyErr_SetString(PyExc_RuntimeError, \\
                    "cannot query " #hipSymbolName " from libamdhip64.so"); \\
    dlclose(lib); \\
    return false; \\
  }}

  HIP_SYMBOL_LIST(QUERY_EACH_FN, QUERY_EACH_FN)

  return true;
}}

static inline void gpuAssert(hipError_t code, const char *file, int line)
{{
  if (code != HIP_SUCCESS)
  {{
    const char* prefix = "Triton Error [HIP]: ";
    const char* str = hipSymbolTable.hipGetErrorString(code);
    char err[1024] = {{0}};
    snprintf(err, 1024, "%s Code: %d, Messsage: %s", prefix, code, str );
    PyErr_SetString(PyExc_RuntimeError, err);
  }}
}}

#define HIP_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }}

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, hipStream_t stream, hipFunction_t function{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
  // printf("_launch hip kernel\\n");
  void *params[] = {{ {', '.join(f"&arg{i}" for i in params)} }};
  if (gridX*gridY*gridZ > 0) {{
      HIP_CHECK(hipSymbolTable.hipModuleLaunchKernel(function, gridX, gridY, gridZ, {warp_size}*num_warps, 1, 1, shared_memory, stream, params, 0));
  }}
}}

typedef struct _DevicePtrInfo {{
    hipDeviceptr_t dev_ptr;
    bool valid;
}} DevicePtrInfo;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {{
    ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }}
  if (obj == Py_None) {{
    // valid nullptr
    return ptr_info;
  }}
  PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
  if(ptr){{
    PyObject *empty_tuple = PyTuple_New(0);
    PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL);
    Py_DECREF(empty_tuple);
    Py_DECREF(ptr);
    if (!PyLong_Check(ret)) {{
      PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
      ptr_info.valid = false;
      return ptr_info;
    }}
    ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(ret);
    if(!ptr_info.dev_ptr)
      return ptr_info;
    uint64_t dev_ptr;
    hipError_t status = hipSymbolTable.hipPointerGetAttribute(&dev_ptr, HIP_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
    if (status == hipErrorInvalidValue) {{
        PyErr_Format(PyExc_ValueError,
                     "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
        ptr_info.valid = false;
    }}
    ptr_info.dev_ptr = (hipDeviceptr_t)dev_ptr;
    Py_DECREF(ret);
    return ptr_info;
  }}
  PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
  return ptr_info;
}}

static PyObject* launch(PyObject* self, PyObject* args) {{
  // printf("launch\\n");
  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
  {' '.join([f"{_extracted_type(ty)} _arg{i}; " for i, ty in signature.items()])}
  if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ, &_stream, &_function,
                                           &kernel_metadata, &launch_metadata,
                                           &launch_enter_hook, &launch_exit_hook {args_list})) {{
    return NULL;
  }}

  // extract kernel metadata
  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, \"iiiiii\", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {{
    return NULL;
  }}
  // extract launch metadata
  if (launch_enter_hook != Py_None){{
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_enter_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
  }}


  // raise exception asap
  {"; ".join([f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;" if ty[0] == "*" else "" for i, ty in signature.items()])};
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (hipStream_t)_stream, (hipFunction_t)_function{', ' + ', '.join(f"ptr_info{i}.dev_ptr" if ty[0]=="*" else f"_arg{i}"for i, ty in signature.items()) if len(signature) > 0 else ''});

  if(launch_exit_hook != Py_None){{
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_exit_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
  }}

  if(PyErr_Occurred()) {{
    return NULL;
  }}
  // return None
  Py_INCREF(Py_None);
  return Py_None;
}}

static PyMethodDef ModuleMethods[] = {{
  {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}},
  {{NULL, NULL, 0, NULL}} // sentinel
}};

static struct PyModuleDef ModuleDef = {{
  PyModuleDef_HEAD_INIT,
  \"__triton_launcher\",
  NULL, //documentation
  -1, //size
  ModuleMethods
}};

PyMODINIT_FUNC PyInit___triton_launcher(void) {{
  if (!initSymbolTable()) {{
    return NULL;
  }}
  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {{
    return NULL;
  }}
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}}
"""
    # NOTE(review): the template contains a duplicated `#include <dlfcn.h>` and
    # a "Messsage" typo in gpuAssert; both are harmless but present in the
    # generated C.  Left untouched here since this edit is documentation-only.
    return src
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
class HIPLauncher(object):
    """Builds and holds the compiled C launch entry point for one kernel."""

    def __init__(self, src, metadata):
        ids = {"ids_of_const_exprs": src.fn.constexprs if hasattr(src, "fn") else tuple()}
        raw_constants = src.constants if hasattr(src, "constants") else dict()

        def to_arg_index(key):
            # Constants/signature entries may be keyed by argument name or by
            # positional index; normalize everything to indices.
            return src.fn.arg_names.index(key) if isinstance(key, str) else key

        constants = {to_arg_index(key): value for key, value in raw_constants.items()}
        signature = {to_arg_index(key): value for key, value in src.signature.items()}
        launcher_src = make_launcher(constants, signature, ids, metadata.warp_size)
        launcher_mod = compile_module_from_src(launcher_src, "__triton_launcher")
        self.launch = launcher_mod.launch

    def __call__(self, *args, **kwargs):
        self.launch(*args, **kwargs)
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
class HIPDriver(GPUDriver):
    """Triton GPU driver implementation for AMD HIP devices."""

    def __init__(self):
        super().__init__()
        self.utils = HIPUtils()
        self.launcher_cls = HIPLauncher

    @staticmethod
    def is_active():
        # Active exactly when the installed torch build targets ROCm/HIP.
        import torch
        return torch.version.hip is not None

    def get_current_target(self):
        device = self.get_current_device()
        props = self.utils.get_device_properties(device)
        # The arch string may carry feature suffixes (e.g. "gfx90a:sramecc+");
        # only the part before the first ':' names the target.
        gfx_arch = props['arch'].split(':')[0]
        return GPUTarget("hip", gfx_arch, props['warpSize'])
|
.venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/hip_runtime.h
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

//! HIP = Heterogeneous-compute Interface for Portability
//!
//! Define a extremely thin runtime layer that allows source code to be compiled unmodified
//! through either AMD CLANG or NVCC. Key features tend to be in the spirit
//! and terminology of CUDA, but with a portable path to other accelerators as well:
//
//! Both paths support rich C++ features including classes, templates, lambdas, etc.
//! Runtime API is C
//! Memory management is based on pure pointers and resembles malloc/free/copy.
//
//! hip_runtime.h : includes everything in hip_api.h, plus math builtins and kernel launch
//! macros. hip_runtime_api.h : Defines HIP API. This is a C header file and does not use any C++
//! features.

#ifndef HIP_INCLUDE_HIP_HIP_RUNTIME_H
#define HIP_INCLUDE_HIP_HIP_RUNTIME_H

// Error out during device compilation when targeting a wavefront size of 64
// on any architecture other than GFX7/GFX8/GFX9.
#if __HIP_DEVICE_COMPILE__ && !__GFX7__ && !__GFX8__ && !__GFX9__ && __AMDGCN_WAVEFRONT_SIZE == 64
#error HIP is not supported on the specified GPU ARCH with wavefront size 64
#endif

#if !defined(__HIPCC_RTC__)
// Some standard header files, these are included by hc.hpp and so want to make them avail on both
// paths to provide a consistent include env and avoid "missing symbol" errors that only appears
// on NVCC path:
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#if __cplusplus > 199711L
#include <thread>
#endif
#endif // !defined(__HIPCC_RTC__)

#include <hip/hip_version.h>
#include <hip/hip_common.h>

// Dispatch to the platform-specific runtime implementation; exactly one of
// the two platform macros must be defined.
#if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
#include <hip/amd_detail/amd_hip_runtime.h>
#elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
#include <hip/nvidia_detail/nvidia_hip_runtime.h>
#else
#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
#endif

#if !defined(__HIPCC_RTC__)
#include <hip/hip_runtime_api.h>
#include <hip/library_types.h>
#endif // !defined(__HIPCC_RTC__)
#include <hip/hip_vector_types.h>

#endif
|
.venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/texture_types.h
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
| 3 |
+
|
| 4 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 5 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 6 |
+
in the Software without restriction, including without limitation the rights
|
| 7 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 8 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 9 |
+
furnished to do so, subject to the following conditions:
|
| 10 |
+
|
| 11 |
+
The above copyright notice and this permission notice shall be included in
|
| 12 |
+
all copies or substantial portions of the Software.
|
| 13 |
+
|
| 14 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 15 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 16 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 17 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 18 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 19 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
| 20 |
+
THE SOFTWARE.
|
| 21 |
+
*/
|
| 22 |
+
|
| 23 |
+
#ifndef HIP_INCLUDE_HIP_TEXTURE_TYPES_H
|
| 24 |
+
#define HIP_INCLUDE_HIP_TEXTURE_TYPES_H
|
| 25 |
+
|
| 26 |
+
#if defined(__clang__)
|
| 27 |
+
#pragma clang diagnostic push
|
| 28 |
+
#pragma clang diagnostic ignored "-Wreserved-identifier"
|
| 29 |
+
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
|
| 30 |
+
#pragma clang diagnostic ignored "-Wc++98-compat"
|
| 31 |
+
#endif
|
| 32 |
+
|
| 33 |
+
#if !defined(__HIPCC_RTC__)
|
| 34 |
+
#include <hip/hip_common.h>
|
| 35 |
+
#endif
|
| 36 |
+
|
| 37 |
+
#if !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
|
| 38 |
+
#include "texture_types.h"
|
| 39 |
+
#elif defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
|
| 40 |
+
/*******************************************************************************
|
| 41 |
+
* *
|
| 42 |
+
* *
|
| 43 |
+
* *
|
| 44 |
+
*******************************************************************************/
|
| 45 |
+
#if !defined(__HIPCC_RTC__)
|
| 46 |
+
#include <limits.h>
|
| 47 |
+
#include <hip/channel_descriptor.h>
|
| 48 |
+
#include <hip/driver_types.h>
|
| 49 |
+
#endif // !defined(__HIPCC_RTC__)
|
| 50 |
+
|
| 51 |
+
#define hipTextureType1D 0x01
|
| 52 |
+
#define hipTextureType2D 0x02
|
| 53 |
+
#define hipTextureType3D 0x03
|
| 54 |
+
#define hipTextureTypeCubemap 0x0C
|
| 55 |
+
#define hipTextureType1DLayered 0xF1
|
| 56 |
+
#define hipTextureType2DLayered 0xF2
|
| 57 |
+
#define hipTextureTypeCubemapLayered 0xFC
|
| 58 |
+
|
| 59 |
+
/**
|
| 60 |
+
* Should be same as HSA_IMAGE_OBJECT_SIZE_DWORD/HSA_SAMPLER_OBJECT_SIZE_DWORD
|
| 61 |
+
*/
|
| 62 |
+
#define HIP_IMAGE_OBJECT_SIZE_DWORD 12
|
| 63 |
+
#define HIP_SAMPLER_OBJECT_SIZE_DWORD 8
|
| 64 |
+
#define HIP_SAMPLER_OBJECT_OFFSET_DWORD HIP_IMAGE_OBJECT_SIZE_DWORD
|
| 65 |
+
#define HIP_TEXTURE_OBJECT_SIZE_DWORD (HIP_IMAGE_OBJECT_SIZE_DWORD + HIP_SAMPLER_OBJECT_SIZE_DWORD)
|
| 66 |
+
|
| 67 |
+
/**
|
| 68 |
+
* An opaque value that represents a hip texture object
|
| 69 |
+
*/
|
| 70 |
+
struct __hip_texture;
|
| 71 |
+
typedef struct __hip_texture* hipTextureObject_t;
|
| 72 |
+
|
| 73 |
+
/**
|
| 74 |
+
* hip texture address modes
|
| 75 |
+
*/
|
| 76 |
+
enum hipTextureAddressMode {
|
| 77 |
+
hipAddressModeWrap = 0,
|
| 78 |
+
hipAddressModeClamp = 1,
|
| 79 |
+
hipAddressModeMirror = 2,
|
| 80 |
+
hipAddressModeBorder = 3
|
| 81 |
+
};
|
| 82 |
+
|
| 83 |
+
/**
|
| 84 |
+
* hip texture filter modes
|
| 85 |
+
*/
|
| 86 |
+
enum hipTextureFilterMode { hipFilterModePoint = 0, hipFilterModeLinear = 1 };
|
| 87 |
+
|
| 88 |
+
/**
|
| 89 |
+
* hip texture read modes
|
| 90 |
+
*/
|
| 91 |
+
enum hipTextureReadMode { hipReadModeElementType = 0, hipReadModeNormalizedFloat = 1 };
|
| 92 |
+
|
| 93 |
+
/**
|
| 94 |
+
* hip texture reference
|
| 95 |
+
*/
|
| 96 |
+
typedef struct textureReference {
|
| 97 |
+
int normalized;
|
| 98 |
+
enum hipTextureReadMode readMode;// used only for driver API's
|
| 99 |
+
enum hipTextureFilterMode filterMode;
|
| 100 |
+
enum hipTextureAddressMode addressMode[3]; // Texture address mode for up to 3 dimensions
|
| 101 |
+
struct hipChannelFormatDesc channelDesc;
|
| 102 |
+
int sRGB; // Perform sRGB->linear conversion during texture read
|
| 103 |
+
unsigned int maxAnisotropy; // Limit to the anisotropy ratio
|
| 104 |
+
enum hipTextureFilterMode mipmapFilterMode;
|
| 105 |
+
float mipmapLevelBias;
|
| 106 |
+
float minMipmapLevelClamp;
|
| 107 |
+
float maxMipmapLevelClamp;
|
| 108 |
+
|
| 109 |
+
hipTextureObject_t textureObject;
|
| 110 |
+
int numChannels;
|
| 111 |
+
enum hipArray_Format format;
|
| 112 |
+
}textureReference;
|
| 113 |
+
|
| 114 |
+
/**
|
| 115 |
+
* hip texture descriptor
|
| 116 |
+
*/
|
| 117 |
+
typedef struct hipTextureDesc {
|
| 118 |
+
enum hipTextureAddressMode addressMode[3]; // Texture address mode for up to 3 dimensions
|
| 119 |
+
enum hipTextureFilterMode filterMode;
|
| 120 |
+
enum hipTextureReadMode readMode;
|
| 121 |
+
int sRGB; // Perform sRGB->linear conversion during texture read
|
| 122 |
+
float borderColor[4];
|
| 123 |
+
int normalizedCoords;
|
| 124 |
+
unsigned int maxAnisotropy;
|
| 125 |
+
enum hipTextureFilterMode mipmapFilterMode;
|
| 126 |
+
float mipmapLevelBias;
|
| 127 |
+
float minMipmapLevelClamp;
|
| 128 |
+
float maxMipmapLevelClamp;
|
| 129 |
+
}hipTextureDesc;
|
| 130 |
+
|
| 131 |
+
#if __cplusplus
|
| 132 |
+
|
| 133 |
+
/*******************************************************************************
|
| 134 |
+
* *
|
| 135 |
+
* *
|
| 136 |
+
* *
|
| 137 |
+
*******************************************************************************/
|
| 138 |
+
#if __HIP__
|
| 139 |
+
#define __HIP_TEXTURE_ATTRIB __attribute__((device_builtin_texture_type))
|
| 140 |
+
#else
|
| 141 |
+
#define __HIP_TEXTURE_ATTRIB
|
| 142 |
+
#endif
|
| 143 |
+
|
| 144 |
+
typedef textureReference* hipTexRef;
|
| 145 |
+
|
| 146 |
+
template <class T, int texType = hipTextureType1D,
|
| 147 |
+
enum hipTextureReadMode mode = hipReadModeElementType>
|
| 148 |
+
struct __HIP_TEXTURE_ATTRIB texture : public textureReference {
|
| 149 |
+
texture(int norm = 0, enum hipTextureFilterMode fMode = hipFilterModePoint,
|
| 150 |
+
enum hipTextureAddressMode aMode = hipAddressModeClamp) {
|
| 151 |
+
normalized = norm;
|
| 152 |
+
readMode = mode;
|
| 153 |
+
filterMode = fMode;
|
| 154 |
+
addressMode[0] = aMode;
|
| 155 |
+
addressMode[1] = aMode;
|
| 156 |
+
addressMode[2] = aMode;
|
| 157 |
+
channelDesc = hipCreateChannelDesc<T>();
|
| 158 |
+
sRGB = 0;
|
| 159 |
+
textureObject = nullptr;
|
| 160 |
+
maxAnisotropy = 0;
|
| 161 |
+
mipmapLevelBias = 0;
|
| 162 |
+
minMipmapLevelClamp = 0;
|
| 163 |
+
maxMipmapLevelClamp = 0;
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
texture(int norm, enum hipTextureFilterMode fMode, enum hipTextureAddressMode aMode,
|
| 167 |
+
struct hipChannelFormatDesc desc) {
|
| 168 |
+
normalized = norm;
|
| 169 |
+
readMode = mode;
|
| 170 |
+
filterMode = fMode;
|
| 171 |
+
addressMode[0] = aMode;
|
| 172 |
+
addressMode[1] = aMode;
|
| 173 |
+
addressMode[2] = aMode;
|
| 174 |
+
channelDesc = desc;
|
| 175 |
+
sRGB = 0;
|
| 176 |
+
textureObject = nullptr;
|
| 177 |
+
maxAnisotropy = 0;
|
| 178 |
+
mipmapLevelBias = 0;
|
| 179 |
+
minMipmapLevelClamp = 0;
|
| 180 |
+
maxMipmapLevelClamp = 0;
|
| 181 |
+
}
|
| 182 |
+
};
|
| 183 |
+
|
| 184 |
+
#endif /* __cplusplus */
|
| 185 |
+
|
| 186 |
+
#else
|
| 187 |
+
#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
|
| 188 |
+
#endif
|
| 189 |
+
|
| 190 |
+
#if defined(__clang__)
|
| 191 |
+
#pragma clang diagnostic pop
|
| 192 |
+
#endif
|
| 193 |
+
|
| 194 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (195 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/compiler.cpython-311.pyc
ADDED
|
Binary file (25.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/driver.cpython-311.pyc
ADDED
|
Binary file (22.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openacc/cupti_openacc.h
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2017 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#include <cuda_stdint.h>
|
| 51 |
+
|
| 52 |
+
#if !defined(_CUPTI_OPENACC_H_)
|
| 53 |
+
#define _CUPTI_OPENACC_H_
|
| 54 |
+
|
| 55 |
+
#ifndef CUPTIAPI
|
| 56 |
+
#ifdef _WIN32
|
| 57 |
+
#define CUPTIAPI __stdcall
|
| 58 |
+
#else
|
| 59 |
+
#define CUPTIAPI
|
| 60 |
+
#endif
|
| 61 |
+
#endif
|
| 62 |
+
|
| 63 |
+
#if defined(__LP64__)
|
| 64 |
+
#define CUPTILP64 1
|
| 65 |
+
#elif defined(_WIN64)
|
| 66 |
+
#define CUPTILP64 1
|
| 67 |
+
#else
|
| 68 |
+
#undef CUPTILP64
|
| 69 |
+
#endif
|
| 70 |
+
|
| 71 |
+
#if defined(__cplusplus)
|
| 72 |
+
extern "C" {
|
| 73 |
+
#endif
|
| 74 |
+
|
| 75 |
+
#if defined(__GNUC__) && defined(CUPTI_LIB)
|
| 76 |
+
#pragma GCC visibility push(default)
|
| 77 |
+
#endif
|
| 78 |
+
|
| 79 |
+
/**
|
| 80 |
+
* \brief Initialize OpenACC support
|
| 81 |
+
*
|
| 82 |
+
* \param profRegister function of type acc_prof_reg as obtained from acc_register_library
|
| 83 |
+
* \param profUnregister function of type acc_prof_reg as obtained from acc_register_library
|
| 84 |
+
* \param profLookup function of type acc_prof_lookup as obtained from acc_register_library
|
| 85 |
+
*/
|
| 86 |
+
CUptiResult CUPTIAPI
|
| 87 |
+
cuptiOpenACCInitialize(void *profRegister, void *profUnregister, void *profLookup);
|
| 88 |
+
|
| 89 |
+
#if defined(__GNUC__) && defined(CUPTI_LIB)
|
| 90 |
+
#pragma GCC visibility pop
|
| 91 |
+
#endif
|
| 92 |
+
|
| 93 |
+
#if defined(__cplusplus)
|
| 94 |
+
}
|
| 95 |
+
#endif
|
| 96 |
+
|
| 97 |
+
#endif /*_CUPTI_OPENACC_H_*/
|
| 98 |
+
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/cupti_openmp.h
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#include <cuda_stdint.h>
|
| 51 |
+
#include "Openmp/omp-tools.h"
|
| 52 |
+
|
| 53 |
+
#if !defined(_CUPTI_OPENMP_H_)
|
| 54 |
+
#define _CUPTI_OPENMP_H_
|
| 55 |
+
|
| 56 |
+
#ifndef CUPTIAPI
|
| 57 |
+
#ifdef _WIN32
|
| 58 |
+
#define CUPTIAPI __stdcall
|
| 59 |
+
#else
|
| 60 |
+
#define CUPTIAPI
|
| 61 |
+
#endif
|
| 62 |
+
#endif
|
| 63 |
+
|
| 64 |
+
#if defined(__LP64__)
|
| 65 |
+
#define CUPTILP64 1
|
| 66 |
+
#elif defined(_WIN64)
|
| 67 |
+
#define CUPTILP64 1
|
| 68 |
+
#else
|
| 69 |
+
#undef CUPTILP64
|
| 70 |
+
#endif
|
| 71 |
+
|
| 72 |
+
#if defined(__cplusplus)
|
| 73 |
+
extern "C" {
|
| 74 |
+
#endif
|
| 75 |
+
|
| 76 |
+
#if defined(__GNUC__) && defined(CUPTI_LIB)
|
| 77 |
+
#pragma GCC visibility push(default)
|
| 78 |
+
#endif
|
| 79 |
+
|
| 80 |
+
/**
|
| 81 |
+
* \brief Initialize OPENMP support (deprecated, used before OpenMP 5.0)
|
| 82 |
+
*
|
| 83 |
+
*/
|
| 84 |
+
int CUPTIAPI cuptiOpenMpInitialize(ompt_function_lookup_t ompt_fn_lookup, const char *runtime_version, unsigned int ompt_version);
|
| 85 |
+
|
| 86 |
+
/**
|
| 87 |
+
* \brief Initialize OPENMP support
|
| 88 |
+
*
|
| 89 |
+
*/
|
| 90 |
+
int CUPTIAPI cuptiOpenMpInitialize_v2(ompt_function_lookup_t lookup, int initial_device_num, ompt_data_t *tool_data);
|
| 91 |
+
|
| 92 |
+
#if defined(__GNUC__) && defined(CUPTI_LIB)
|
| 93 |
+
#pragma GCC visibility pop
|
| 94 |
+
#endif
|
| 95 |
+
|
| 96 |
+
#if defined(__cplusplus)
|
| 97 |
+
}
|
| 98 |
+
#endif
|
| 99 |
+
|
| 100 |
+
#endif /*_CUPTI_OPENMP_H_*/
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/omp-tools.h
ADDED
|
@@ -0,0 +1,1083 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* include/50/omp-tools.h.var
|
| 3 |
+
*/
|
| 4 |
+
|
| 5 |
+
//===----------------------------------------------------------------------===//
|
| 6 |
+
//
|
| 7 |
+
// The LLVM Compiler Infrastructure
|
| 8 |
+
//
|
| 9 |
+
// This file is dual licensed under the MIT and the University of Illinois Open
|
| 10 |
+
// Source Licenses. See LICENSE.txt for details.
|
| 11 |
+
//
|
| 12 |
+
//===----------------------------------------------------------------------===//
|
| 13 |
+
|
| 14 |
+
#ifndef __OMPT__
|
| 15 |
+
#define __OMPT__
|
| 16 |
+
|
| 17 |
+
/*****************************************************************************
|
| 18 |
+
* system include files
|
| 19 |
+
*****************************************************************************/
|
| 20 |
+
|
| 21 |
+
#include <stdint.h>
|
| 22 |
+
#include <stddef.h>
|
| 23 |
+
|
| 24 |
+
/*****************************************************************************
|
| 25 |
+
* iteration macros
|
| 26 |
+
*****************************************************************************/
|
| 27 |
+
|
| 28 |
+
#define FOREACH_OMPT_INQUIRY_FN(macro) \
|
| 29 |
+
macro (ompt_enumerate_states) \
|
| 30 |
+
macro (ompt_enumerate_mutex_impls) \
|
| 31 |
+
\
|
| 32 |
+
macro (ompt_set_callback) \
|
| 33 |
+
macro (ompt_get_callback) \
|
| 34 |
+
\
|
| 35 |
+
macro (ompt_get_state) \
|
| 36 |
+
\
|
| 37 |
+
macro (ompt_get_parallel_info) \
|
| 38 |
+
macro (ompt_get_task_info) \
|
| 39 |
+
macro (ompt_get_task_memory) \
|
| 40 |
+
macro (ompt_get_thread_data) \
|
| 41 |
+
macro (ompt_get_unique_id) \
|
| 42 |
+
macro (ompt_finalize_tool) \
|
| 43 |
+
\
|
| 44 |
+
macro(ompt_get_num_procs) \
|
| 45 |
+
macro(ompt_get_num_places) \
|
| 46 |
+
macro(ompt_get_place_proc_ids) \
|
| 47 |
+
macro(ompt_get_place_num) \
|
| 48 |
+
macro(ompt_get_partition_place_nums) \
|
| 49 |
+
macro(ompt_get_proc_id) \
|
| 50 |
+
\
|
| 51 |
+
macro(ompt_get_target_info) \
|
| 52 |
+
macro(ompt_get_num_devices)
|
| 53 |
+
|
| 54 |
+
#define FOREACH_OMPT_STATE(macro) \
|
| 55 |
+
\
|
| 56 |
+
/* first available state */ \
|
| 57 |
+
macro (ompt_state_undefined, 0x102) /* undefined thread state */ \
|
| 58 |
+
\
|
| 59 |
+
/* work states (0..15) */ \
|
| 60 |
+
macro (ompt_state_work_serial, 0x000) /* working outside parallel */ \
|
| 61 |
+
macro (ompt_state_work_parallel, 0x001) /* working within parallel */ \
|
| 62 |
+
macro (ompt_state_work_reduction, 0x002) /* performing a reduction */ \
|
| 63 |
+
\
|
| 64 |
+
/* barrier wait states (16..31) */ \
|
| 65 |
+
macro (ompt_state_wait_barrier, 0x010) /* waiting at a barrier */ \
|
| 66 |
+
macro (ompt_state_wait_barrier_implicit_parallel, 0x011) \
|
| 67 |
+
/* implicit barrier at the end of parallel region */\
|
| 68 |
+
macro (ompt_state_wait_barrier_implicit_workshare, 0x012) \
|
| 69 |
+
/* implicit barrier at the end of worksharing */ \
|
| 70 |
+
macro (ompt_state_wait_barrier_implicit, 0x013) /* implicit barrier */ \
|
| 71 |
+
macro (ompt_state_wait_barrier_explicit, 0x014) /* explicit barrier */ \
|
| 72 |
+
\
|
| 73 |
+
/* task wait states (32..63) */ \
|
| 74 |
+
macro (ompt_state_wait_taskwait, 0x020) /* waiting at a taskwait */ \
|
| 75 |
+
macro (ompt_state_wait_taskgroup, 0x021) /* waiting at a taskgroup */ \
|
| 76 |
+
\
|
| 77 |
+
/* mutex wait states (64..127) */ \
|
| 78 |
+
macro (ompt_state_wait_mutex, 0x040) \
|
| 79 |
+
macro (ompt_state_wait_lock, 0x041) /* waiting for lock */ \
|
| 80 |
+
macro (ompt_state_wait_critical, 0x042) /* waiting for critical */ \
|
| 81 |
+
macro (ompt_state_wait_atomic, 0x043) /* waiting for atomic */ \
|
| 82 |
+
macro (ompt_state_wait_ordered, 0x044) /* waiting for ordered */ \
|
| 83 |
+
\
|
| 84 |
+
/* target wait states (128..255) */ \
|
| 85 |
+
macro (ompt_state_wait_target, 0x080) /* waiting for target region */ \
|
| 86 |
+
macro (ompt_state_wait_target_map, 0x081) /* waiting for target data mapping operation */ \
|
| 87 |
+
macro (ompt_state_wait_target_update, 0x082) /* waiting for target update operation */ \
|
| 88 |
+
\
|
| 89 |
+
/* misc (256..511) */ \
|
| 90 |
+
macro (ompt_state_idle, 0x100) /* waiting for work */ \
|
| 91 |
+
macro (ompt_state_overhead, 0x101) /* overhead excluding wait states */ \
|
| 92 |
+
\
|
| 93 |
+
/* implementation-specific states (512..) */
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
#define FOREACH_KMP_MUTEX_IMPL(macro) \
|
| 97 |
+
macro (kmp_mutex_impl_none, 0) /* unknown implementation */ \
|
| 98 |
+
macro (kmp_mutex_impl_spin, 1) /* based on spin */ \
|
| 99 |
+
macro (kmp_mutex_impl_queuing, 2) /* based on some fair policy */ \
|
| 100 |
+
macro (kmp_mutex_impl_speculative, 3) /* based on HW-supported speculation */
|
| 101 |
+
|
| 102 |
+
#define FOREACH_OMPT_EVENT(macro) \
|
| 103 |
+
\
|
| 104 |
+
/*--- Mandatory Events ---*/ \
|
| 105 |
+
macro (ompt_callback_thread_begin, ompt_callback_thread_begin_t, 1) /* thread begin */ \
|
| 106 |
+
macro (ompt_callback_thread_end, ompt_callback_thread_end_t, 2) /* thread end */ \
|
| 107 |
+
\
|
| 108 |
+
macro (ompt_callback_parallel_begin, ompt_callback_parallel_begin_t, 3) /* parallel begin */ \
|
| 109 |
+
macro (ompt_callback_parallel_end, ompt_callback_parallel_end_t, 4) /* parallel end */ \
|
| 110 |
+
\
|
| 111 |
+
macro (ompt_callback_task_create, ompt_callback_task_create_t, 5) /* task begin */ \
|
| 112 |
+
macro (ompt_callback_task_schedule, ompt_callback_task_schedule_t, 6) /* task schedule */ \
|
| 113 |
+
macro (ompt_callback_implicit_task, ompt_callback_implicit_task_t, 7) /* implicit task */ \
|
| 114 |
+
\
|
| 115 |
+
macro (ompt_callback_target, ompt_callback_target_t, 8) /* target */ \
|
| 116 |
+
macro (ompt_callback_target_data_op, ompt_callback_target_data_op_t, 9) /* target data op */ \
|
| 117 |
+
macro (ompt_callback_target_submit, ompt_callback_target_submit_t, 10) /* target submit */ \
|
| 118 |
+
\
|
| 119 |
+
macro (ompt_callback_control_tool, ompt_callback_control_tool_t, 11) /* control tool */ \
|
| 120 |
+
\
|
| 121 |
+
macro (ompt_callback_device_initialize, ompt_callback_device_initialize_t, 12) /* device initialize */ \
|
| 122 |
+
macro (ompt_callback_device_finalize, ompt_callback_device_finalize_t, 13) /* device finalize */ \
|
| 123 |
+
\
|
| 124 |
+
macro (ompt_callback_device_load, ompt_callback_device_load_t, 14) /* device load */ \
|
| 125 |
+
macro (ompt_callback_device_unload, ompt_callback_device_unload_t, 15) /* device unload */ \
|
| 126 |
+
\
|
| 127 |
+
/* Optional Events */ \
|
| 128 |
+
macro (ompt_callback_sync_region_wait, ompt_callback_sync_region_t, 16) /* sync region wait begin or end */ \
|
| 129 |
+
\
|
| 130 |
+
macro (ompt_callback_mutex_released, ompt_callback_mutex_t, 17) /* mutex released */ \
|
| 131 |
+
\
|
| 132 |
+
macro (ompt_callback_dependences, ompt_callback_dependences_t, 18) /* report task dependences */ \
|
| 133 |
+
macro (ompt_callback_task_dependence, ompt_callback_task_dependence_t, 19) /* report task dependence */ \
|
| 134 |
+
\
|
| 135 |
+
macro (ompt_callback_work, ompt_callback_work_t, 20) /* task at work begin or end */ \
|
| 136 |
+
\
|
| 137 |
+
macro (ompt_callback_master, ompt_callback_master_t, 21) /* task at master begin or end */ \
|
| 138 |
+
\
|
| 139 |
+
macro (ompt_callback_target_map, ompt_callback_target_map_t, 22) /* target map */ \
|
| 140 |
+
\
|
| 141 |
+
macro (ompt_callback_sync_region, ompt_callback_sync_region_t, 23) /* sync region begin or end */ \
|
| 142 |
+
\
|
| 143 |
+
macro (ompt_callback_lock_init, ompt_callback_mutex_acquire_t, 24) /* lock init */ \
|
| 144 |
+
macro (ompt_callback_lock_destroy, ompt_callback_mutex_t, 25) /* lock destroy */ \
|
| 145 |
+
\
|
| 146 |
+
macro (ompt_callback_mutex_acquire, ompt_callback_mutex_acquire_t, 26) /* mutex acquire */ \
|
| 147 |
+
macro (ompt_callback_mutex_acquired, ompt_callback_mutex_t, 27) /* mutex acquired */ \
|
| 148 |
+
\
|
| 149 |
+
macro (ompt_callback_nest_lock, ompt_callback_nest_lock_t, 28) /* nest lock */ \
|
| 150 |
+
\
|
| 151 |
+
macro (ompt_callback_flush, ompt_callback_flush_t, 29) /* after executing flush */ \
|
| 152 |
+
\
|
| 153 |
+
macro (ompt_callback_cancel, ompt_callback_cancel_t, 30) /* cancel innermost binding region */ \
|
| 154 |
+
\
|
| 155 |
+
macro (ompt_callback_reduction, ompt_callback_sync_region_t, 31) /* reduction */ \
|
| 156 |
+
\
|
| 157 |
+
macro (ompt_callback_dispatch, ompt_callback_dispatch_t, 32) /* dispatch of work */
|
| 158 |
+
|
| 159 |
+
/*****************************************************************************
|
| 160 |
+
* implementation specific types
|
| 161 |
+
*****************************************************************************/
|
| 162 |
+
|
| 163 |
+
typedef enum kmp_mutex_impl_t {
|
| 164 |
+
#define kmp_mutex_impl_macro(impl, code) impl = code,
|
| 165 |
+
FOREACH_KMP_MUTEX_IMPL(kmp_mutex_impl_macro)
|
| 166 |
+
#undef kmp_mutex_impl_macro
|
| 167 |
+
} kmp_mutex_impl_t;
|
| 168 |
+
|
| 169 |
+
/*****************************************************************************
|
| 170 |
+
* definitions generated from spec
|
| 171 |
+
*****************************************************************************/
|
| 172 |
+
|
| 173 |
+
typedef enum ompt_callbacks_t {
|
| 174 |
+
ompt_callback_thread_begin = 1,
|
| 175 |
+
ompt_callback_thread_end = 2,
|
| 176 |
+
ompt_callback_parallel_begin = 3,
|
| 177 |
+
ompt_callback_parallel_end = 4,
|
| 178 |
+
ompt_callback_task_create = 5,
|
| 179 |
+
ompt_callback_task_schedule = 6,
|
| 180 |
+
ompt_callback_implicit_task = 7,
|
| 181 |
+
ompt_callback_target = 8,
|
| 182 |
+
ompt_callback_target_data_op = 9,
|
| 183 |
+
ompt_callback_target_submit = 10,
|
| 184 |
+
ompt_callback_control_tool = 11,
|
| 185 |
+
ompt_callback_device_initialize = 12,
|
| 186 |
+
ompt_callback_device_finalize = 13,
|
| 187 |
+
ompt_callback_device_load = 14,
|
| 188 |
+
ompt_callback_device_unload = 15,
|
| 189 |
+
ompt_callback_sync_region_wait = 16,
|
| 190 |
+
ompt_callback_mutex_released = 17,
|
| 191 |
+
ompt_callback_dependences = 18,
|
| 192 |
+
ompt_callback_task_dependence = 19,
|
| 193 |
+
ompt_callback_work = 20,
|
| 194 |
+
ompt_callback_master = 21,
|
| 195 |
+
ompt_callback_target_map = 22,
|
| 196 |
+
ompt_callback_sync_region = 23,
|
| 197 |
+
ompt_callback_lock_init = 24,
|
| 198 |
+
ompt_callback_lock_destroy = 25,
|
| 199 |
+
ompt_callback_mutex_acquire = 26,
|
| 200 |
+
ompt_callback_mutex_acquired = 27,
|
| 201 |
+
ompt_callback_nest_lock = 28,
|
| 202 |
+
ompt_callback_flush = 29,
|
| 203 |
+
ompt_callback_cancel = 30,
|
| 204 |
+
ompt_callback_reduction = 31,
|
| 205 |
+
ompt_callback_dispatch = 32
|
| 206 |
+
} ompt_callbacks_t;
|
| 207 |
+
|
| 208 |
+
typedef enum ompt_record_t {
|
| 209 |
+
ompt_record_ompt = 1,
|
| 210 |
+
ompt_record_native = 2,
|
| 211 |
+
ompt_record_invalid = 3
|
| 212 |
+
} ompt_record_t;
|
| 213 |
+
|
| 214 |
+
typedef enum ompt_record_native_t {
|
| 215 |
+
ompt_record_native_info = 1,
|
| 216 |
+
ompt_record_native_event = 2
|
| 217 |
+
} ompt_record_native_t;
|
| 218 |
+
|
| 219 |
+
typedef enum ompt_set_result_t {
|
| 220 |
+
ompt_set_error = 0,
|
| 221 |
+
ompt_set_never = 1,
|
| 222 |
+
ompt_set_impossible = 2,
|
| 223 |
+
ompt_set_sometimes = 3,
|
| 224 |
+
ompt_set_sometimes_paired = 4,
|
| 225 |
+
ompt_set_always = 5
|
| 226 |
+
} ompt_set_result_t;
|
| 227 |
+
|
| 228 |
+
typedef uint64_t ompt_id_t;
|
| 229 |
+
|
| 230 |
+
typedef uint64_t ompt_device_time_t;
|
| 231 |
+
|
| 232 |
+
typedef uint64_t ompt_buffer_cursor_t;
|
| 233 |
+
|
| 234 |
+
typedef enum ompt_thread_t {
|
| 235 |
+
ompt_thread_initial = 1,
|
| 236 |
+
ompt_thread_worker = 2,
|
| 237 |
+
ompt_thread_other = 3,
|
| 238 |
+
ompt_thread_unknown = 4
|
| 239 |
+
} ompt_thread_t;
|
| 240 |
+
|
| 241 |
+
typedef enum ompt_scope_endpoint_t {
|
| 242 |
+
ompt_scope_begin = 1,
|
| 243 |
+
ompt_scope_end = 2
|
| 244 |
+
} ompt_scope_endpoint_t;
|
| 245 |
+
|
| 246 |
+
typedef enum ompt_dispatch_t {
|
| 247 |
+
ompt_dispatch_iteration = 1,
|
| 248 |
+
ompt_dispatch_section = 2
|
| 249 |
+
} ompt_dispatch_t;
|
| 250 |
+
|
| 251 |
+
typedef enum ompt_sync_region_t {
|
| 252 |
+
ompt_sync_region_barrier = 1,
|
| 253 |
+
ompt_sync_region_barrier_implicit = 2,
|
| 254 |
+
ompt_sync_region_barrier_explicit = 3,
|
| 255 |
+
ompt_sync_region_barrier_implementation = 4,
|
| 256 |
+
ompt_sync_region_taskwait = 5,
|
| 257 |
+
ompt_sync_region_taskgroup = 6,
|
| 258 |
+
ompt_sync_region_reduction = 7
|
| 259 |
+
} ompt_sync_region_t;
|
| 260 |
+
|
| 261 |
+
typedef enum ompt_target_data_op_t {
|
| 262 |
+
ompt_target_data_alloc = 1,
|
| 263 |
+
ompt_target_data_transfer_to_device = 2,
|
| 264 |
+
ompt_target_data_transfer_from_device = 3,
|
| 265 |
+
ompt_target_data_delete = 4,
|
| 266 |
+
ompt_target_data_associate = 5,
|
| 267 |
+
ompt_target_data_disassociate = 6
|
| 268 |
+
} ompt_target_data_op_t;
|
| 269 |
+
|
| 270 |
+
typedef enum ompt_work_t {
|
| 271 |
+
ompt_work_loop = 1,
|
| 272 |
+
ompt_work_sections = 2,
|
| 273 |
+
ompt_work_single_executor = 3,
|
| 274 |
+
ompt_work_single_other = 4,
|
| 275 |
+
ompt_work_workshare = 5,
|
| 276 |
+
ompt_work_distribute = 6,
|
| 277 |
+
ompt_work_taskloop = 7
|
| 278 |
+
} ompt_work_t;
|
| 279 |
+
|
| 280 |
+
typedef enum ompt_mutex_t {
|
| 281 |
+
ompt_mutex_lock = 1,
|
| 282 |
+
ompt_mutex_test_lock = 2,
|
| 283 |
+
ompt_mutex_nest_lock = 3,
|
| 284 |
+
ompt_mutex_test_nest_lock = 4,
|
| 285 |
+
ompt_mutex_critical = 5,
|
| 286 |
+
ompt_mutex_atomic = 6,
|
| 287 |
+
ompt_mutex_ordered = 7
|
| 288 |
+
} ompt_mutex_t;
|
| 289 |
+
|
| 290 |
+
typedef enum ompt_native_mon_flag_t {
|
| 291 |
+
ompt_native_data_motion_explicit = 0x01,
|
| 292 |
+
ompt_native_data_motion_implicit = 0x02,
|
| 293 |
+
ompt_native_kernel_invocation = 0x04,
|
| 294 |
+
ompt_native_kernel_execution = 0x08,
|
| 295 |
+
ompt_native_driver = 0x10,
|
| 296 |
+
ompt_native_runtime = 0x20,
|
| 297 |
+
ompt_native_overhead = 0x40,
|
| 298 |
+
ompt_native_idleness = 0x80
|
| 299 |
+
} ompt_native_mon_flag_t;
|
| 300 |
+
|
| 301 |
+
typedef enum ompt_task_flag_t {
|
| 302 |
+
ompt_task_initial = 0x00000001,
|
| 303 |
+
ompt_task_implicit = 0x00000002,
|
| 304 |
+
ompt_task_explicit = 0x00000004,
|
| 305 |
+
ompt_task_target = 0x00000008,
|
| 306 |
+
ompt_task_undeferred = 0x08000000,
|
| 307 |
+
ompt_task_untied = 0x10000000,
|
| 308 |
+
ompt_task_final = 0x20000000,
|
| 309 |
+
ompt_task_mergeable = 0x40000000,
|
| 310 |
+
ompt_task_merged = 0x80000000
|
| 311 |
+
} ompt_task_flag_t;
|
| 312 |
+
|
| 313 |
+
typedef enum ompt_task_status_t {
|
| 314 |
+
ompt_task_complete = 1,
|
| 315 |
+
ompt_task_yield = 2,
|
| 316 |
+
ompt_task_cancel = 3,
|
| 317 |
+
ompt_task_detach = 4,
|
| 318 |
+
ompt_task_early_fulfill = 5,
|
| 319 |
+
ompt_task_late_fulfill = 6,
|
| 320 |
+
ompt_task_switch = 7
|
| 321 |
+
} ompt_task_status_t;
|
| 322 |
+
|
| 323 |
+
typedef enum ompt_target_t {
|
| 324 |
+
ompt_target = 1,
|
| 325 |
+
ompt_target_enter_data = 2,
|
| 326 |
+
ompt_target_exit_data = 3,
|
| 327 |
+
ompt_target_update = 4
|
| 328 |
+
} ompt_target_t;
|
| 329 |
+
|
| 330 |
+
typedef enum ompt_parallel_flag_t {
|
| 331 |
+
ompt_parallel_invoker_program = 0x00000001,
|
| 332 |
+
ompt_parallel_invoker_runtime = 0x00000002,
|
| 333 |
+
ompt_parallel_league = 0x40000000,
|
| 334 |
+
ompt_parallel_team = 0x80000000
|
| 335 |
+
} ompt_parallel_flag_t;
|
| 336 |
+
|
| 337 |
+
typedef enum ompt_target_map_flag_t {
|
| 338 |
+
ompt_target_map_flag_to = 0x01,
|
| 339 |
+
ompt_target_map_flag_from = 0x02,
|
| 340 |
+
ompt_target_map_flag_alloc = 0x04,
|
| 341 |
+
ompt_target_map_flag_release = 0x08,
|
| 342 |
+
ompt_target_map_flag_delete = 0x10,
|
| 343 |
+
ompt_target_map_flag_implicit = 0x20
|
| 344 |
+
} ompt_target_map_flag_t;
|
| 345 |
+
|
| 346 |
+
typedef enum ompt_dependence_type_t {
|
| 347 |
+
ompt_dependence_type_in = 1,
|
| 348 |
+
ompt_dependence_type_out = 2,
|
| 349 |
+
ompt_dependence_type_inout = 3,
|
| 350 |
+
ompt_dependence_type_mutexinoutset = 4,
|
| 351 |
+
ompt_dependence_type_source = 5,
|
| 352 |
+
ompt_dependence_type_sink = 6
|
| 353 |
+
} ompt_dependence_type_t;
|
| 354 |
+
|
| 355 |
+
typedef enum ompt_cancel_flag_t {
|
| 356 |
+
ompt_cancel_parallel = 0x01,
|
| 357 |
+
ompt_cancel_sections = 0x02,
|
| 358 |
+
ompt_cancel_loop = 0x04,
|
| 359 |
+
ompt_cancel_taskgroup = 0x08,
|
| 360 |
+
ompt_cancel_activated = 0x10,
|
| 361 |
+
ompt_cancel_detected = 0x20,
|
| 362 |
+
ompt_cancel_discarded_task = 0x40
|
| 363 |
+
} ompt_cancel_flag_t;
|
| 364 |
+
|
| 365 |
+
typedef uint64_t ompt_hwid_t;
|
| 366 |
+
|
| 367 |
+
typedef uint64_t ompt_wait_id_t;
|
| 368 |
+
|
| 369 |
+
typedef enum ompt_frame_flag_t {
|
| 370 |
+
ompt_frame_runtime = 0x00,
|
| 371 |
+
ompt_frame_application = 0x01,
|
| 372 |
+
ompt_frame_cfa = 0x10,
|
| 373 |
+
ompt_frame_framepointer = 0x20,
|
| 374 |
+
ompt_frame_stackaddress = 0x30
|
| 375 |
+
} ompt_frame_flag_t;
|
| 376 |
+
|
| 377 |
+
typedef enum ompt_state_t {
|
| 378 |
+
ompt_state_work_serial = 0x000,
|
| 379 |
+
ompt_state_work_parallel = 0x001,
|
| 380 |
+
ompt_state_work_reduction = 0x002,
|
| 381 |
+
|
| 382 |
+
ompt_state_wait_barrier = 0x010,
|
| 383 |
+
ompt_state_wait_barrier_implicit_parallel = 0x011,
|
| 384 |
+
ompt_state_wait_barrier_implicit_workshare = 0x012,
|
| 385 |
+
ompt_state_wait_barrier_implicit = 0x013,
|
| 386 |
+
ompt_state_wait_barrier_explicit = 0x014,
|
| 387 |
+
|
| 388 |
+
ompt_state_wait_taskwait = 0x020,
|
| 389 |
+
ompt_state_wait_taskgroup = 0x021,
|
| 390 |
+
|
| 391 |
+
ompt_state_wait_mutex = 0x040,
|
| 392 |
+
ompt_state_wait_lock = 0x041,
|
| 393 |
+
ompt_state_wait_critical = 0x042,
|
| 394 |
+
ompt_state_wait_atomic = 0x043,
|
| 395 |
+
ompt_state_wait_ordered = 0x044,
|
| 396 |
+
|
| 397 |
+
ompt_state_wait_target = 0x080,
|
| 398 |
+
ompt_state_wait_target_map = 0x081,
|
| 399 |
+
ompt_state_wait_target_update = 0x082,
|
| 400 |
+
|
| 401 |
+
ompt_state_idle = 0x100,
|
| 402 |
+
ompt_state_overhead = 0x101,
|
| 403 |
+
ompt_state_undefined = 0x102
|
| 404 |
+
} ompt_state_t;
|
| 405 |
+
|
| 406 |
+
typedef uint64_t (*ompt_get_unique_id_t) (void);
|
| 407 |
+
|
| 408 |
+
typedef uint64_t ompd_size_t;
|
| 409 |
+
|
| 410 |
+
typedef uint64_t ompd_wait_id_t;
|
| 411 |
+
|
| 412 |
+
typedef uint64_t ompd_addr_t;
|
| 413 |
+
typedef int64_t ompd_word_t;
|
| 414 |
+
typedef uint64_t ompd_seg_t;
|
| 415 |
+
|
| 416 |
+
typedef uint64_t ompd_device_t;
|
| 417 |
+
|
| 418 |
+
typedef uint64_t ompd_thread_id_t;
|
| 419 |
+
|
| 420 |
+
typedef enum ompd_scope_t {
|
| 421 |
+
ompd_scope_global = 1,
|
| 422 |
+
ompd_scope_address_space = 2,
|
| 423 |
+
ompd_scope_thread = 3,
|
| 424 |
+
ompd_scope_parallel = 4,
|
| 425 |
+
ompd_scope_implicit_task = 5,
|
| 426 |
+
ompd_scope_task = 6
|
| 427 |
+
} ompd_scope_t;
|
| 428 |
+
|
| 429 |
+
typedef uint64_t ompd_icv_id_t;
|
| 430 |
+
|
| 431 |
+
typedef enum ompd_rc_t {
|
| 432 |
+
ompd_rc_ok = 0,
|
| 433 |
+
ompd_rc_unavailable = 1,
|
| 434 |
+
ompd_rc_stale_handle = 2,
|
| 435 |
+
ompd_rc_bad_input = 3,
|
| 436 |
+
ompd_rc_error = 4,
|
| 437 |
+
ompd_rc_unsupported = 5,
|
| 438 |
+
ompd_rc_needs_state_tracking = 6,
|
| 439 |
+
ompd_rc_incompatible = 7,
|
| 440 |
+
ompd_rc_device_read_error = 8,
|
| 441 |
+
ompd_rc_device_write_error = 9,
|
| 442 |
+
ompd_rc_nomem = 10,
|
| 443 |
+
} ompd_rc_t;
|
| 444 |
+
|
| 445 |
+
typedef void (*ompt_interface_fn_t) (void);
|
| 446 |
+
|
| 447 |
+
typedef ompt_interface_fn_t (*ompt_function_lookup_t) (
|
| 448 |
+
const char *interface_function_name
|
| 449 |
+
);
|
| 450 |
+
|
| 451 |
+
typedef union ompt_data_t {
|
| 452 |
+
uint64_t value;
|
| 453 |
+
void *ptr;
|
| 454 |
+
} ompt_data_t;
|
| 455 |
+
|
| 456 |
+
typedef struct ompt_frame_t {
|
| 457 |
+
ompt_data_t exit_frame;
|
| 458 |
+
ompt_data_t enter_frame;
|
| 459 |
+
int exit_frame_flags;
|
| 460 |
+
int enter_frame_flags;
|
| 461 |
+
} ompt_frame_t;
|
| 462 |
+
|
| 463 |
+
typedef void (*ompt_callback_t) (void);
|
| 464 |
+
|
| 465 |
+
typedef void ompt_device_t;
|
| 466 |
+
|
| 467 |
+
typedef void ompt_buffer_t;
|
| 468 |
+
|
| 469 |
+
typedef void (*ompt_callback_buffer_request_t) (
|
| 470 |
+
int device_num,
|
| 471 |
+
ompt_buffer_t **buffer,
|
| 472 |
+
size_t *bytes
|
| 473 |
+
);
|
| 474 |
+
|
| 475 |
+
typedef void (*ompt_callback_buffer_complete_t) (
|
| 476 |
+
int device_num,
|
| 477 |
+
ompt_buffer_t *buffer,
|
| 478 |
+
size_t bytes,
|
| 479 |
+
ompt_buffer_cursor_t begin,
|
| 480 |
+
int buffer_owned
|
| 481 |
+
);
|
| 482 |
+
|
| 483 |
+
typedef void (*ompt_finalize_t) (
|
| 484 |
+
ompt_data_t *tool_data
|
| 485 |
+
);
|
| 486 |
+
|
| 487 |
+
typedef int (*ompt_initialize_t) (
|
| 488 |
+
ompt_function_lookup_t lookup,
|
| 489 |
+
int initial_device_num,
|
| 490 |
+
ompt_data_t *tool_data
|
| 491 |
+
);
|
| 492 |
+
|
| 493 |
+
typedef struct ompt_start_tool_result_t {
|
| 494 |
+
ompt_initialize_t initialize;
|
| 495 |
+
ompt_finalize_t finalize;
|
| 496 |
+
ompt_data_t tool_data;
|
| 497 |
+
} ompt_start_tool_result_t;
|
| 498 |
+
|
| 499 |
+
typedef struct ompt_record_abstract_t {
|
| 500 |
+
ompt_record_native_t rclass;
|
| 501 |
+
const char *type;
|
| 502 |
+
ompt_device_time_t start_time;
|
| 503 |
+
ompt_device_time_t end_time;
|
| 504 |
+
ompt_hwid_t hwid;
|
| 505 |
+
} ompt_record_abstract_t;
|
| 506 |
+
|
| 507 |
+
typedef struct ompt_dependence_t {
|
| 508 |
+
ompt_data_t variable;
|
| 509 |
+
ompt_dependence_type_t dependence_type;
|
| 510 |
+
} ompt_dependence_t;
|
| 511 |
+
|
| 512 |
+
typedef int (*ompt_enumerate_states_t) (
|
| 513 |
+
int current_state,
|
| 514 |
+
int *next_state,
|
| 515 |
+
const char **next_state_name
|
| 516 |
+
);
|
| 517 |
+
|
| 518 |
+
typedef int (*ompt_enumerate_mutex_impls_t) (
|
| 519 |
+
int current_impl,
|
| 520 |
+
int *next_impl,
|
| 521 |
+
const char **next_impl_name
|
| 522 |
+
);
|
| 523 |
+
|
| 524 |
+
typedef ompt_set_result_t (*ompt_set_callback_t) (
|
| 525 |
+
ompt_callbacks_t event,
|
| 526 |
+
ompt_callback_t callback
|
| 527 |
+
);
|
| 528 |
+
|
| 529 |
+
typedef int (*ompt_get_callback_t) (
|
| 530 |
+
ompt_callbacks_t event,
|
| 531 |
+
ompt_callback_t *callback
|
| 532 |
+
);
|
| 533 |
+
|
| 534 |
+
typedef ompt_data_t *(*ompt_get_thread_data_t) (void);
|
| 535 |
+
|
| 536 |
+
typedef int (*ompt_get_num_procs_t) (void);
|
| 537 |
+
|
| 538 |
+
typedef int (*ompt_get_num_places_t) (void);
|
| 539 |
+
|
| 540 |
+
typedef int (*ompt_get_place_proc_ids_t) (
|
| 541 |
+
int place_num,
|
| 542 |
+
int ids_size,
|
| 543 |
+
int *ids
|
| 544 |
+
);
|
| 545 |
+
|
| 546 |
+
typedef int (*ompt_get_place_num_t) (void);
|
| 547 |
+
|
| 548 |
+
typedef int (*ompt_get_partition_place_nums_t) (
|
| 549 |
+
int place_nums_size,
|
| 550 |
+
int *place_nums
|
| 551 |
+
);
|
| 552 |
+
|
| 553 |
+
typedef int (*ompt_get_proc_id_t) (void);
|
| 554 |
+
|
| 555 |
+
typedef int (*ompt_get_state_t) (
|
| 556 |
+
ompt_wait_id_t *wait_id
|
| 557 |
+
);
|
| 558 |
+
|
| 559 |
+
typedef int (*ompt_get_parallel_info_t) (
|
| 560 |
+
int ancestor_level,
|
| 561 |
+
ompt_data_t **parallel_data,
|
| 562 |
+
int *team_size
|
| 563 |
+
);
|
| 564 |
+
|
| 565 |
+
typedef int (*ompt_get_task_info_t) (
|
| 566 |
+
int ancestor_level,
|
| 567 |
+
int *flags,
|
| 568 |
+
ompt_data_t **task_data,
|
| 569 |
+
ompt_frame_t **task_frame,
|
| 570 |
+
ompt_data_t **parallel_data,
|
| 571 |
+
int *thread_num
|
| 572 |
+
);
|
| 573 |
+
|
| 574 |
+
typedef int (*ompt_get_task_memory_t)(
|
| 575 |
+
void **addr,
|
| 576 |
+
size_t *size,
|
| 577 |
+
int block
|
| 578 |
+
);
|
| 579 |
+
|
| 580 |
+
typedef int (*ompt_get_target_info_t) (
|
| 581 |
+
uint64_t *device_num,
|
| 582 |
+
ompt_id_t *target_id,
|
| 583 |
+
ompt_id_t *host_op_id
|
| 584 |
+
);
|
| 585 |
+
|
| 586 |
+
typedef int (*ompt_get_num_devices_t) (void);
|
| 587 |
+
|
| 588 |
+
typedef void (*ompt_finalize_tool_t) (void);
|
| 589 |
+
|
| 590 |
+
typedef int (*ompt_get_device_num_procs_t) (
|
| 591 |
+
ompt_device_t *device
|
| 592 |
+
);
|
| 593 |
+
|
| 594 |
+
typedef ompt_device_time_t (*ompt_get_device_time_t) (
|
| 595 |
+
ompt_device_t *device
|
| 596 |
+
);
|
| 597 |
+
|
| 598 |
+
typedef double (*ompt_translate_time_t) (
|
| 599 |
+
ompt_device_t *device,
|
| 600 |
+
ompt_device_time_t time
|
| 601 |
+
);
|
| 602 |
+
|
| 603 |
+
typedef ompt_set_result_t (*ompt_set_trace_ompt_t) (
|
| 604 |
+
ompt_device_t *device,
|
| 605 |
+
unsigned int enable,
|
| 606 |
+
unsigned int etype
|
| 607 |
+
);
|
| 608 |
+
|
| 609 |
+
typedef ompt_set_result_t (*ompt_set_trace_native_t) (
|
| 610 |
+
ompt_device_t *device,
|
| 611 |
+
int enable,
|
| 612 |
+
int flags
|
| 613 |
+
);
|
| 614 |
+
|
| 615 |
+
typedef int (*ompt_start_trace_t) (
|
| 616 |
+
ompt_device_t *device,
|
| 617 |
+
ompt_callback_buffer_request_t request,
|
| 618 |
+
ompt_callback_buffer_complete_t complete
|
| 619 |
+
);
|
| 620 |
+
|
| 621 |
+
typedef int (*ompt_pause_trace_t) (
|
| 622 |
+
ompt_device_t *device,
|
| 623 |
+
int begin_pause
|
| 624 |
+
);
|
| 625 |
+
|
| 626 |
+
typedef int (*ompt_flush_trace_t) (
|
| 627 |
+
ompt_device_t *device
|
| 628 |
+
);
|
| 629 |
+
|
| 630 |
+
typedef int (*ompt_stop_trace_t) (
|
| 631 |
+
ompt_device_t *device
|
| 632 |
+
);
|
| 633 |
+
|
| 634 |
+
typedef int (*ompt_advance_buffer_cursor_t) (
|
| 635 |
+
ompt_device_t *device,
|
| 636 |
+
ompt_buffer_t *buffer,
|
| 637 |
+
size_t size,
|
| 638 |
+
ompt_buffer_cursor_t current,
|
| 639 |
+
ompt_buffer_cursor_t *next
|
| 640 |
+
);
|
| 641 |
+
|
| 642 |
+
typedef ompt_record_t (*ompt_get_record_type_t) (
|
| 643 |
+
ompt_buffer_t *buffer,
|
| 644 |
+
ompt_buffer_cursor_t current
|
| 645 |
+
);
|
| 646 |
+
|
| 647 |
+
typedef void *(*ompt_get_record_native_t) (
|
| 648 |
+
ompt_buffer_t *buffer,
|
| 649 |
+
ompt_buffer_cursor_t current,
|
| 650 |
+
ompt_id_t *host_op_id
|
| 651 |
+
);
|
| 652 |
+
|
| 653 |
+
typedef ompt_record_abstract_t *
|
| 654 |
+
(*ompt_get_record_abstract_t) (
|
| 655 |
+
void *native_record
|
| 656 |
+
);
|
| 657 |
+
|
| 658 |
+
typedef void (*ompt_callback_thread_begin_t) (
|
| 659 |
+
ompt_thread_t thread_type,
|
| 660 |
+
ompt_data_t *thread_data
|
| 661 |
+
);
|
| 662 |
+
|
| 663 |
+
typedef struct ompt_record_thread_begin_t {
|
| 664 |
+
ompt_thread_t thread_type;
|
| 665 |
+
} ompt_record_thread_begin_t;
|
| 666 |
+
|
| 667 |
+
typedef void (*ompt_callback_thread_end_t) (
|
| 668 |
+
ompt_data_t *thread_data
|
| 669 |
+
);
|
| 670 |
+
|
| 671 |
+
typedef void (*ompt_callback_parallel_begin_t) (
|
| 672 |
+
ompt_data_t *encountering_task_data,
|
| 673 |
+
const ompt_frame_t *encountering_task_frame,
|
| 674 |
+
ompt_data_t *parallel_data,
|
| 675 |
+
unsigned int requested_parallelism,
|
| 676 |
+
int flags,
|
| 677 |
+
const void *codeptr_ra
|
| 678 |
+
);
|
| 679 |
+
|
| 680 |
+
typedef struct ompt_record_parallel_begin_t {
|
| 681 |
+
ompt_id_t encountering_task_id;
|
| 682 |
+
ompt_id_t parallel_id;
|
| 683 |
+
unsigned int requested_parallelism;
|
| 684 |
+
int flags;
|
| 685 |
+
const void *codeptr_ra;
|
| 686 |
+
} ompt_record_parallel_begin_t;
|
| 687 |
+
|
| 688 |
+
typedef void (*ompt_callback_parallel_end_t) (
|
| 689 |
+
ompt_data_t *parallel_data,
|
| 690 |
+
ompt_data_t *encountering_task_data,
|
| 691 |
+
int flags,
|
| 692 |
+
const void *codeptr_ra
|
| 693 |
+
);
|
| 694 |
+
|
| 695 |
+
typedef struct ompt_record_parallel_end_t {
|
| 696 |
+
ompt_id_t parallel_id;
|
| 697 |
+
ompt_id_t encountering_task_id;
|
| 698 |
+
int flags;
|
| 699 |
+
const void *codeptr_ra;
|
| 700 |
+
} ompt_record_parallel_end_t;
|
| 701 |
+
|
| 702 |
+
typedef void (*ompt_callback_work_t) (
|
| 703 |
+
ompt_work_t wstype,
|
| 704 |
+
ompt_scope_endpoint_t endpoint,
|
| 705 |
+
ompt_data_t *parallel_data,
|
| 706 |
+
ompt_data_t *task_data,
|
| 707 |
+
uint64_t count,
|
| 708 |
+
const void *codeptr_ra
|
| 709 |
+
);
|
| 710 |
+
|
| 711 |
+
typedef struct ompt_record_work_t {
|
| 712 |
+
ompt_work_t wstype;
|
| 713 |
+
ompt_scope_endpoint_t endpoint;
|
| 714 |
+
ompt_id_t parallel_id;
|
| 715 |
+
ompt_id_t task_id;
|
| 716 |
+
uint64_t count;
|
| 717 |
+
const void *codeptr_ra;
|
| 718 |
+
} ompt_record_work_t;
|
| 719 |
+
|
| 720 |
+
typedef void (*ompt_callback_dispatch_t) (
|
| 721 |
+
ompt_data_t *parallel_data,
|
| 722 |
+
ompt_data_t *task_data,
|
| 723 |
+
ompt_dispatch_t kind,
|
| 724 |
+
ompt_data_t instance
|
| 725 |
+
);
|
| 726 |
+
|
| 727 |
+
typedef struct ompt_record_dispatch_t {
|
| 728 |
+
ompt_id_t parallel_id;
|
| 729 |
+
ompt_id_t task_id;
|
| 730 |
+
ompt_dispatch_t kind;
|
| 731 |
+
ompt_data_t instance;
|
| 732 |
+
} ompt_record_dispatch_t;
|
| 733 |
+
|
| 734 |
+
typedef void (*ompt_callback_task_create_t) (
|
| 735 |
+
ompt_data_t *encountering_task_data,
|
| 736 |
+
const ompt_frame_t *encountering_task_frame,
|
| 737 |
+
ompt_data_t *new_task_data,
|
| 738 |
+
int flags,
|
| 739 |
+
int has_dependences,
|
| 740 |
+
const void *codeptr_ra
|
| 741 |
+
);
|
| 742 |
+
|
| 743 |
+
typedef struct ompt_record_task_create_t {
|
| 744 |
+
ompt_id_t encountering_task_id;
|
| 745 |
+
ompt_id_t new_task_id;
|
| 746 |
+
int flags;
|
| 747 |
+
int has_dependences;
|
| 748 |
+
const void *codeptr_ra;
|
| 749 |
+
} ompt_record_task_create_t;
|
| 750 |
+
|
| 751 |
+
typedef void (*ompt_callback_dependences_t) (
|
| 752 |
+
ompt_data_t *task_data,
|
| 753 |
+
const ompt_dependence_t *deps,
|
| 754 |
+
int ndeps
|
| 755 |
+
);
|
| 756 |
+
|
| 757 |
+
typedef struct ompt_record_dependences_t {
|
| 758 |
+
ompt_id_t task_id;
|
| 759 |
+
ompt_dependence_t dep;
|
| 760 |
+
int ndeps;
|
| 761 |
+
} ompt_record_dependences_t;
|
| 762 |
+
|
| 763 |
+
typedef void (*ompt_callback_task_dependence_t) (
|
| 764 |
+
ompt_data_t *src_task_data,
|
| 765 |
+
ompt_data_t *sink_task_data
|
| 766 |
+
);
|
| 767 |
+
|
| 768 |
+
typedef struct ompt_record_task_dependence_t {
|
| 769 |
+
ompt_id_t src_task_id;
|
| 770 |
+
ompt_id_t sink_task_id;
|
| 771 |
+
} ompt_record_task_dependence_t;
|
| 772 |
+
|
| 773 |
+
typedef void (*ompt_callback_task_schedule_t) (
|
| 774 |
+
ompt_data_t *prior_task_data,
|
| 775 |
+
ompt_task_status_t prior_task_status,
|
| 776 |
+
ompt_data_t *next_task_data
|
| 777 |
+
);
|
| 778 |
+
|
| 779 |
+
typedef struct ompt_record_task_schedule_t {
|
| 780 |
+
ompt_id_t prior_task_id;
|
| 781 |
+
ompt_task_status_t prior_task_status;
|
| 782 |
+
ompt_id_t next_task_id;
|
| 783 |
+
} ompt_record_task_schedule_t;
|
| 784 |
+
|
| 785 |
+
typedef void (*ompt_callback_implicit_task_t) (
|
| 786 |
+
ompt_scope_endpoint_t endpoint,
|
| 787 |
+
ompt_data_t *parallel_data,
|
| 788 |
+
ompt_data_t *task_data,
|
| 789 |
+
unsigned int actual_parallelism,
|
| 790 |
+
unsigned int index,
|
| 791 |
+
int flags
|
| 792 |
+
);
|
| 793 |
+
|
| 794 |
+
typedef struct ompt_record_implicit_task_t {
|
| 795 |
+
ompt_scope_endpoint_t endpoint;
|
| 796 |
+
ompt_id_t parallel_id;
|
| 797 |
+
ompt_id_t task_id;
|
| 798 |
+
unsigned int actual_parallelism;
|
| 799 |
+
unsigned int index;
|
| 800 |
+
int flags;
|
| 801 |
+
} ompt_record_implicit_task_t;
|
| 802 |
+
|
| 803 |
+
typedef void (*ompt_callback_master_t) (
|
| 804 |
+
ompt_scope_endpoint_t endpoint,
|
| 805 |
+
ompt_data_t *parallel_data,
|
| 806 |
+
ompt_data_t *task_data,
|
| 807 |
+
const void *codeptr_ra
|
| 808 |
+
);
|
| 809 |
+
|
| 810 |
+
typedef struct ompt_record_master_t {
|
| 811 |
+
ompt_scope_endpoint_t endpoint;
|
| 812 |
+
ompt_id_t parallel_id;
|
| 813 |
+
ompt_id_t task_id;
|
| 814 |
+
const void *codeptr_ra;
|
| 815 |
+
} ompt_record_master_t;
|
| 816 |
+
|
| 817 |
+
typedef void (*ompt_callback_sync_region_t) (
|
| 818 |
+
ompt_sync_region_t kind,
|
| 819 |
+
ompt_scope_endpoint_t endpoint,
|
| 820 |
+
ompt_data_t *parallel_data,
|
| 821 |
+
ompt_data_t *task_data,
|
| 822 |
+
const void *codeptr_ra
|
| 823 |
+
);
|
| 824 |
+
|
| 825 |
+
typedef struct ompt_record_sync_region_t {
|
| 826 |
+
ompt_sync_region_t kind;
|
| 827 |
+
ompt_scope_endpoint_t endpoint;
|
| 828 |
+
ompt_id_t parallel_id;
|
| 829 |
+
ompt_id_t task_id;
|
| 830 |
+
const void *codeptr_ra;
|
| 831 |
+
} ompt_record_sync_region_t;
|
| 832 |
+
|
| 833 |
+
typedef void (*ompt_callback_mutex_acquire_t) (
|
| 834 |
+
ompt_mutex_t kind,
|
| 835 |
+
unsigned int hint,
|
| 836 |
+
unsigned int impl,
|
| 837 |
+
ompt_wait_id_t wait_id,
|
| 838 |
+
const void *codeptr_ra
|
| 839 |
+
);
|
| 840 |
+
|
| 841 |
+
typedef struct ompt_record_mutex_acquire_t {
|
| 842 |
+
ompt_mutex_t kind;
|
| 843 |
+
unsigned int hint;
|
| 844 |
+
unsigned int impl;
|
| 845 |
+
ompt_wait_id_t wait_id;
|
| 846 |
+
const void *codeptr_ra;
|
| 847 |
+
} ompt_record_mutex_acquire_t;
|
| 848 |
+
|
| 849 |
+
typedef void (*ompt_callback_mutex_t) (
|
| 850 |
+
ompt_mutex_t kind,
|
| 851 |
+
ompt_wait_id_t wait_id,
|
| 852 |
+
const void *codeptr_ra
|
| 853 |
+
);
|
| 854 |
+
|
| 855 |
+
typedef struct ompt_record_mutex_t {
|
| 856 |
+
ompt_mutex_t kind;
|
| 857 |
+
ompt_wait_id_t wait_id;
|
| 858 |
+
const void *codeptr_ra;
|
| 859 |
+
} ompt_record_mutex_t;
|
| 860 |
+
|
| 861 |
+
typedef void (*ompt_callback_nest_lock_t) (
|
| 862 |
+
ompt_scope_endpoint_t endpoint,
|
| 863 |
+
ompt_wait_id_t wait_id,
|
| 864 |
+
const void *codeptr_ra
|
| 865 |
+
);
|
| 866 |
+
|
| 867 |
+
typedef struct ompt_record_nest_lock_t {
|
| 868 |
+
ompt_scope_endpoint_t endpoint;
|
| 869 |
+
ompt_wait_id_t wait_id;
|
| 870 |
+
const void *codeptr_ra;
|
| 871 |
+
} ompt_record_nest_lock_t;
|
| 872 |
+
|
| 873 |
+
typedef void (*ompt_callback_flush_t) (
|
| 874 |
+
ompt_data_t *thread_data,
|
| 875 |
+
const void *codeptr_ra
|
| 876 |
+
);
|
| 877 |
+
|
| 878 |
+
typedef struct ompt_record_flush_t {
|
| 879 |
+
const void *codeptr_ra;
|
| 880 |
+
} ompt_record_flush_t;
|
| 881 |
+
|
| 882 |
+
typedef void (*ompt_callback_cancel_t) (
|
| 883 |
+
ompt_data_t *task_data,
|
| 884 |
+
int flags,
|
| 885 |
+
const void *codeptr_ra
|
| 886 |
+
);
|
| 887 |
+
|
| 888 |
+
typedef struct ompt_record_cancel_t {
|
| 889 |
+
ompt_id_t task_id;
|
| 890 |
+
int flags;
|
| 891 |
+
const void *codeptr_ra;
|
| 892 |
+
} ompt_record_cancel_t;
|
| 893 |
+
|
| 894 |
+
typedef void (*ompt_callback_device_initialize_t) (
|
| 895 |
+
int device_num,
|
| 896 |
+
const char *type,
|
| 897 |
+
ompt_device_t *device,
|
| 898 |
+
ompt_function_lookup_t lookup,
|
| 899 |
+
const char *documentation
|
| 900 |
+
);
|
| 901 |
+
|
| 902 |
+
typedef void (*ompt_callback_device_finalize_t) (
|
| 903 |
+
int device_num
|
| 904 |
+
);
|
| 905 |
+
|
| 906 |
+
typedef void (*ompt_callback_device_load_t) (
|
| 907 |
+
int device_num,
|
| 908 |
+
const char *filename,
|
| 909 |
+
int64_t offset_in_file,
|
| 910 |
+
void *vma_in_file,
|
| 911 |
+
size_t bytes,
|
| 912 |
+
void *host_addr,
|
| 913 |
+
void *device_addr,
|
| 914 |
+
uint64_t module_id
|
| 915 |
+
);
|
| 916 |
+
|
| 917 |
+
typedef void (*ompt_callback_device_unload_t) (
|
| 918 |
+
int device_num,
|
| 919 |
+
uint64_t module_id
|
| 920 |
+
);
|
| 921 |
+
|
| 922 |
+
typedef void (*ompt_callback_target_data_op_t) (
|
| 923 |
+
ompt_id_t target_id,
|
| 924 |
+
ompt_id_t host_op_id,
|
| 925 |
+
ompt_target_data_op_t optype,
|
| 926 |
+
void *src_addr,
|
| 927 |
+
int src_device_num,
|
| 928 |
+
void *dest_addr,
|
| 929 |
+
int dest_device_num,
|
| 930 |
+
size_t bytes,
|
| 931 |
+
const void *codeptr_ra
|
| 932 |
+
);
|
| 933 |
+
|
| 934 |
+
typedef struct ompt_record_target_data_op_t {
|
| 935 |
+
ompt_id_t host_op_id;
|
| 936 |
+
ompt_target_data_op_t optype;
|
| 937 |
+
void *src_addr;
|
| 938 |
+
int src_device_num;
|
| 939 |
+
void *dest_addr;
|
| 940 |
+
int dest_device_num;
|
| 941 |
+
size_t bytes;
|
| 942 |
+
ompt_device_time_t end_time;
|
| 943 |
+
const void *codeptr_ra;
|
| 944 |
+
} ompt_record_target_data_op_t;
|
| 945 |
+
|
| 946 |
+
typedef void (*ompt_callback_target_t) (
|
| 947 |
+
ompt_target_t kind,
|
| 948 |
+
ompt_scope_endpoint_t endpoint,
|
| 949 |
+
int device_num,
|
| 950 |
+
ompt_data_t *task_data,
|
| 951 |
+
ompt_id_t target_id,
|
| 952 |
+
const void *codeptr_ra
|
| 953 |
+
);
|
| 954 |
+
|
| 955 |
+
typedef struct ompt_record_target_t {
|
| 956 |
+
ompt_target_t kind;
|
| 957 |
+
ompt_scope_endpoint_t endpoint;
|
| 958 |
+
int device_num;
|
| 959 |
+
ompt_id_t task_id;
|
| 960 |
+
ompt_id_t target_id;
|
| 961 |
+
const void *codeptr_ra;
|
| 962 |
+
} ompt_record_target_t;
|
| 963 |
+
|
| 964 |
+
typedef void (*ompt_callback_target_map_t) (
|
| 965 |
+
ompt_id_t target_id,
|
| 966 |
+
unsigned int nitems,
|
| 967 |
+
void **host_addr,
|
| 968 |
+
void **device_addr,
|
| 969 |
+
size_t *bytes,
|
| 970 |
+
unsigned int *mapping_flags,
|
| 971 |
+
const void *codeptr_ra
|
| 972 |
+
);
|
| 973 |
+
|
| 974 |
+
typedef struct ompt_record_target_map_t {
|
| 975 |
+
ompt_id_t target_id;
|
| 976 |
+
unsigned int nitems;
|
| 977 |
+
void **host_addr;
|
| 978 |
+
void **device_addr;
|
| 979 |
+
size_t *bytes;
|
| 980 |
+
unsigned int *mapping_flags;
|
| 981 |
+
const void *codeptr_ra;
|
| 982 |
+
} ompt_record_target_map_t;
|
| 983 |
+
|
| 984 |
+
typedef void (*ompt_callback_target_submit_t) (
|
| 985 |
+
ompt_id_t target_id,
|
| 986 |
+
ompt_id_t host_op_id,
|
| 987 |
+
unsigned int requested_num_teams
|
| 988 |
+
);
|
| 989 |
+
|
| 990 |
+
typedef struct ompt_record_target_kernel_t {
|
| 991 |
+
ompt_id_t host_op_id;
|
| 992 |
+
unsigned int requested_num_teams;
|
| 993 |
+
unsigned int granted_num_teams;
|
| 994 |
+
ompt_device_time_t end_time;
|
| 995 |
+
} ompt_record_target_kernel_t;
|
| 996 |
+
|
| 997 |
+
typedef int (*ompt_callback_control_tool_t) (
|
| 998 |
+
uint64_t command,
|
| 999 |
+
uint64_t modifier,
|
| 1000 |
+
void *arg,
|
| 1001 |
+
const void *codeptr_ra
|
| 1002 |
+
);
|
| 1003 |
+
|
| 1004 |
+
typedef struct ompt_record_control_tool_t {
|
| 1005 |
+
uint64_t command;
|
| 1006 |
+
uint64_t modifier;
|
| 1007 |
+
const void *codeptr_ra;
|
| 1008 |
+
} ompt_record_control_tool_t;
|
| 1009 |
+
|
| 1010 |
+
typedef struct ompd_address_t {
|
| 1011 |
+
ompd_seg_t segment;
|
| 1012 |
+
ompd_addr_t address;
|
| 1013 |
+
} ompd_address_t;
|
| 1014 |
+
|
| 1015 |
+
typedef struct ompd_frame_info_t {
|
| 1016 |
+
ompd_address_t frame_address;
|
| 1017 |
+
ompd_word_t frame_flag;
|
| 1018 |
+
} ompd_frame_info_t;
|
| 1019 |
+
|
| 1020 |
+
typedef struct _ompd_aspace_handle ompd_address_space_handle_t;
|
| 1021 |
+
typedef struct _ompd_thread_handle ompd_thread_handle_t;
|
| 1022 |
+
typedef struct _ompd_parallel_handle ompd_parallel_handle_t;
|
| 1023 |
+
typedef struct _ompd_task_handle ompd_task_handle_t;
|
| 1024 |
+
|
| 1025 |
+
typedef struct _ompd_aspace_cont ompd_address_space_context_t;
|
| 1026 |
+
typedef struct _ompd_thread_cont ompd_thread_context_t;
|
| 1027 |
+
|
| 1028 |
+
typedef struct ompd_device_type_sizes_t {
|
| 1029 |
+
uint8_t sizeof_char;
|
| 1030 |
+
uint8_t sizeof_short;
|
| 1031 |
+
uint8_t sizeof_int;
|
| 1032 |
+
uint8_t sizeof_long;
|
| 1033 |
+
uint8_t sizeof_long_long;
|
| 1034 |
+
uint8_t sizeof_pointer;
|
| 1035 |
+
} ompd_device_type_sizes_t;
|
| 1036 |
+
|
| 1037 |
+
typedef struct ompt_record_ompt_t {
|
| 1038 |
+
ompt_callbacks_t type;
|
| 1039 |
+
ompt_device_time_t time;
|
| 1040 |
+
ompt_id_t thread_id;
|
| 1041 |
+
ompt_id_t target_id;
|
| 1042 |
+
union {
|
| 1043 |
+
ompt_record_thread_begin_t thread_begin;
|
| 1044 |
+
ompt_record_parallel_begin_t parallel_begin;
|
| 1045 |
+
ompt_record_parallel_end_t parallel_end;
|
| 1046 |
+
ompt_record_work_t work;
|
| 1047 |
+
ompt_record_dispatch_t dispatch;
|
| 1048 |
+
ompt_record_task_create_t task_create;
|
| 1049 |
+
ompt_record_dependences_t dependences;
|
| 1050 |
+
ompt_record_task_dependence_t task_dependence;
|
| 1051 |
+
ompt_record_task_schedule_t task_schedule;
|
| 1052 |
+
ompt_record_implicit_task_t implicit_task;
|
| 1053 |
+
ompt_record_master_t master;
|
| 1054 |
+
ompt_record_sync_region_t sync_region;
|
| 1055 |
+
ompt_record_mutex_acquire_t mutex_acquire;
|
| 1056 |
+
ompt_record_mutex_t mutex;
|
| 1057 |
+
ompt_record_nest_lock_t nest_lock;
|
| 1058 |
+
ompt_record_flush_t flush;
|
| 1059 |
+
ompt_record_cancel_t cancel;
|
| 1060 |
+
ompt_record_target_t target;
|
| 1061 |
+
ompt_record_target_data_op_t target_data_op;
|
| 1062 |
+
ompt_record_target_map_t target_map;
|
| 1063 |
+
ompt_record_target_kernel_t target_kernel;
|
| 1064 |
+
ompt_record_control_tool_t control_tool;
|
| 1065 |
+
} record;
|
| 1066 |
+
} ompt_record_ompt_t;
|
| 1067 |
+
|
| 1068 |
+
typedef ompt_record_ompt_t *(*ompt_get_record_ompt_t) (
|
| 1069 |
+
ompt_buffer_t *buffer,
|
| 1070 |
+
ompt_buffer_cursor_t current
|
| 1071 |
+
);
|
| 1072 |
+
|
| 1073 |
+
#define ompt_id_none 0
|
| 1074 |
+
#define ompt_data_none {0}
|
| 1075 |
+
#define ompt_time_none 0
|
| 1076 |
+
#define ompt_hwid_none 0
|
| 1077 |
+
#define ompt_addr_none ~0
|
| 1078 |
+
#define ompt_mutex_impl_none 0
|
| 1079 |
+
#define ompt_wait_id_none 0
|
| 1080 |
+
|
| 1081 |
+
#define ompd_segment_none 0
|
| 1082 |
+
|
| 1083 |
+
#endif /* __OMPT__ */
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/channel_descriptor.h
ADDED
|
@@ -0,0 +1,588 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CHANNEL_DESCRIPTOR_H__)
|
| 51 |
+
#define __CHANNEL_DESCRIPTOR_H__
|
| 52 |
+
|
| 53 |
+
#if defined(__cplusplus)
|
| 54 |
+
|
| 55 |
+
/*******************************************************************************
|
| 56 |
+
* *
|
| 57 |
+
* *
|
| 58 |
+
* *
|
| 59 |
+
*******************************************************************************/
|
| 60 |
+
|
| 61 |
+
#include "cuda_runtime_api.h"
|
| 62 |
+
|
| 63 |
+
/*******************************************************************************
|
| 64 |
+
* *
|
| 65 |
+
* *
|
| 66 |
+
* *
|
| 67 |
+
*******************************************************************************/
|
| 68 |
+
|
| 69 |
+
/**
|
| 70 |
+
* \addtogroup CUDART_HIGHLEVEL
|
| 71 |
+
*
|
| 72 |
+
* @{
|
| 73 |
+
*/
|
| 74 |
+
|
| 75 |
+
/**
|
| 76 |
+
* \brief \hl Returns a channel descriptor using the specified format
|
| 77 |
+
*
|
| 78 |
+
* Returns a channel descriptor with format \p f and number of bits of each
|
| 79 |
+
* component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
|
| 80 |
+
* defined as:
|
| 81 |
+
* \code
|
| 82 |
+
struct cudaChannelFormatDesc {
|
| 83 |
+
int x, y, z, w;
|
| 84 |
+
enum cudaChannelFormatKind f;
|
| 85 |
+
};
|
| 86 |
+
* \endcode
|
| 87 |
+
*
|
| 88 |
+
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
|
| 89 |
+
* ::cudaChannelFormatKindUnsigned, cudaChannelFormatKindFloat,
|
| 90 |
+
* ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2,
|
| 91 |
+
* ::cudaChannelFormatKindSignedNormalized8X4,
|
| 92 |
+
* ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2,
|
| 93 |
+
* ::cudaChannelFormatKindUnsignedNormalized8X4,
|
| 94 |
+
* ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2,
|
| 95 |
+
* ::cudaChannelFormatKindSignedNormalized16X4,
|
| 96 |
+
* ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2,
|
| 97 |
+
* ::cudaChannelFormatKindUnsignedNormalized16X4
|
| 98 |
+
* or ::cudaChannelFormatKindNV12.
|
| 99 |
+
*
|
| 100 |
+
* The format is specified by the template specialization.
|
| 101 |
+
*
|
| 102 |
+
* The template function specializes for the following scalar types:
|
| 103 |
+
* char, signed char, unsigned char, short, unsigned short, int, unsigned int, long, unsigned long, and float.
|
| 104 |
+
* The template function specializes for the following vector types:
|
| 105 |
+
* char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, long{1|2|4}, ulong{1|2|4}, float{1|2|4}.
|
| 106 |
+
* The template function specializes for following cudaChannelFormatKind enum values:
|
| 107 |
+
* ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4}, and ::cudaChannelFormatKindNV12.
|
| 108 |
+
*
|
| 109 |
+
* Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone
|
| 110 |
+
*
|
| 111 |
+
* \return
|
| 112 |
+
* Channel descriptor with format \p f
|
| 113 |
+
*
|
| 114 |
+
* \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
|
| 115 |
+
* ::cudaGetChannelDesc,
|
| 116 |
+
*/
|
| 117 |
+
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
|
| 118 |
+
{
|
| 119 |
+
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
|
| 123 |
+
{
|
| 124 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 125 |
+
|
| 126 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
|
| 130 |
+
{
|
| 131 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 132 |
+
|
| 133 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
|
| 137 |
+
{
|
| 138 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 139 |
+
|
| 140 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
|
| 144 |
+
{
|
| 145 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 146 |
+
|
| 147 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
|
| 151 |
+
{
|
| 152 |
+
int e = (int)sizeof(char) * 8;
|
| 153 |
+
|
| 154 |
+
#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
|
| 155 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 156 |
+
#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
|
| 157 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 158 |
+
#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
|
| 162 |
+
{
|
| 163 |
+
int e = (int)sizeof(signed char) * 8;
|
| 164 |
+
|
| 165 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
|
| 169 |
+
{
|
| 170 |
+
int e = (int)sizeof(unsigned char) * 8;
|
| 171 |
+
|
| 172 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
|
| 176 |
+
{
|
| 177 |
+
int e = (int)sizeof(signed char) * 8;
|
| 178 |
+
|
| 179 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
|
| 183 |
+
{
|
| 184 |
+
int e = (int)sizeof(unsigned char) * 8;
|
| 185 |
+
|
| 186 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
|
| 190 |
+
{
|
| 191 |
+
int e = (int)sizeof(signed char) * 8;
|
| 192 |
+
|
| 193 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
|
| 197 |
+
{
|
| 198 |
+
int e = (int)sizeof(unsigned char) * 8;
|
| 199 |
+
|
| 200 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
|
| 204 |
+
{
|
| 205 |
+
int e = (int)sizeof(signed char) * 8;
|
| 206 |
+
|
| 207 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
|
| 211 |
+
{
|
| 212 |
+
int e = (int)sizeof(unsigned char) * 8;
|
| 213 |
+
|
| 214 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
|
| 218 |
+
{
|
| 219 |
+
int e = (int)sizeof(short) * 8;
|
| 220 |
+
|
| 221 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
|
| 225 |
+
{
|
| 226 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 227 |
+
|
| 228 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
|
| 232 |
+
{
|
| 233 |
+
int e = (int)sizeof(short) * 8;
|
| 234 |
+
|
| 235 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
|
| 239 |
+
{
|
| 240 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 241 |
+
|
| 242 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
|
| 246 |
+
{
|
| 247 |
+
int e = (int)sizeof(short) * 8;
|
| 248 |
+
|
| 249 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
|
| 253 |
+
{
|
| 254 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 255 |
+
|
| 256 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
|
| 260 |
+
{
|
| 261 |
+
int e = (int)sizeof(short) * 8;
|
| 262 |
+
|
| 263 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
|
| 267 |
+
{
|
| 268 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 269 |
+
|
| 270 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
|
| 271 |
+
}
|
| 272 |
+
|
| 273 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
|
| 274 |
+
{
|
| 275 |
+
int e = (int)sizeof(int) * 8;
|
| 276 |
+
|
| 277 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
|
| 281 |
+
{
|
| 282 |
+
int e = (int)sizeof(unsigned int) * 8;
|
| 283 |
+
|
| 284 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
|
| 288 |
+
{
|
| 289 |
+
int e = (int)sizeof(int) * 8;
|
| 290 |
+
|
| 291 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 292 |
+
}
|
| 293 |
+
|
| 294 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
|
| 295 |
+
{
|
| 296 |
+
int e = (int)sizeof(unsigned int) * 8;
|
| 297 |
+
|
| 298 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
|
| 302 |
+
{
|
| 303 |
+
int e = (int)sizeof(int) * 8;
|
| 304 |
+
|
| 305 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
|
| 309 |
+
{
|
| 310 |
+
int e = (int)sizeof(unsigned int) * 8;
|
| 311 |
+
|
| 312 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
|
| 313 |
+
}
|
| 314 |
+
|
| 315 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
|
| 316 |
+
{
|
| 317 |
+
int e = (int)sizeof(int) * 8;
|
| 318 |
+
|
| 319 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
|
| 320 |
+
}
|
| 321 |
+
|
| 322 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
|
| 323 |
+
{
|
| 324 |
+
int e = (int)sizeof(unsigned int) * 8;
|
| 325 |
+
|
| 326 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
|
| 327 |
+
}
|
| 328 |
+
|
| 329 |
+
#if !defined(__LP64__)
|
| 330 |
+
|
| 331 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
|
| 332 |
+
{
|
| 333 |
+
int e = (int)sizeof(long) * 8;
|
| 334 |
+
|
| 335 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 336 |
+
}
|
| 337 |
+
|
| 338 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
|
| 339 |
+
{
|
| 340 |
+
int e = (int)sizeof(unsigned long) * 8;
|
| 341 |
+
|
| 342 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 343 |
+
}
|
| 344 |
+
|
| 345 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
|
| 346 |
+
{
|
| 347 |
+
int e = (int)sizeof(long) * 8;
|
| 348 |
+
|
| 349 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 350 |
+
}
|
| 351 |
+
|
| 352 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
|
| 353 |
+
{
|
| 354 |
+
int e = (int)sizeof(unsigned long) * 8;
|
| 355 |
+
|
| 356 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 357 |
+
}
|
| 358 |
+
|
| 359 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
|
| 360 |
+
{
|
| 361 |
+
int e = (int)sizeof(long) * 8;
|
| 362 |
+
|
| 363 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
|
| 364 |
+
}
|
| 365 |
+
|
| 366 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
|
| 367 |
+
{
|
| 368 |
+
int e = (int)sizeof(unsigned long) * 8;
|
| 369 |
+
|
| 370 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
|
| 371 |
+
}
|
| 372 |
+
|
| 373 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
|
| 374 |
+
{
|
| 375 |
+
int e = (int)sizeof(long) * 8;
|
| 376 |
+
|
| 377 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
|
| 378 |
+
}
|
| 379 |
+
|
| 380 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
|
| 381 |
+
{
|
| 382 |
+
int e = (int)sizeof(unsigned long) * 8;
|
| 383 |
+
|
| 384 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
#endif /* !__LP64__ */
|
| 388 |
+
|
| 389 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
|
| 390 |
+
{
|
| 391 |
+
int e = (int)sizeof(float) * 8;
|
| 392 |
+
|
| 393 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
|
| 397 |
+
{
|
| 398 |
+
int e = (int)sizeof(float) * 8;
|
| 399 |
+
|
| 400 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
| 401 |
+
}
|
| 402 |
+
|
| 403 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
|
| 404 |
+
{
|
| 405 |
+
int e = (int)sizeof(float) * 8;
|
| 406 |
+
|
| 407 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
|
| 411 |
+
{
|
| 412 |
+
int e = (int)sizeof(float) * 8;
|
| 413 |
+
|
| 414 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
|
| 415 |
+
}
|
| 416 |
+
|
| 417 |
+
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void)
|
| 418 |
+
{
|
| 419 |
+
int e = (int)sizeof(char) * 8;
|
| 420 |
+
|
| 421 |
+
return cudaCreateChannelDesc(e, e, e, 0, cudaChannelFormatKindNV12);
|
| 422 |
+
}
|
| 423 |
+
|
| 424 |
+
template<cudaChannelFormatKind> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
|
| 425 |
+
{
|
| 426 |
+
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
|
| 427 |
+
}
|
| 428 |
+
|
| 429 |
+
/* Signed 8-bit normalized integer formats */
|
| 430 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X1>(void)
|
| 431 |
+
{
|
| 432 |
+
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1);
|
| 433 |
+
}
|
| 434 |
+
|
| 435 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X2>(void)
|
| 436 |
+
{
|
| 437 |
+
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2);
|
| 438 |
+
}
|
| 439 |
+
|
| 440 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X4>(void)
|
| 441 |
+
{
|
| 442 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4);
|
| 443 |
+
}
|
| 444 |
+
|
| 445 |
+
/* Unsigned 8-bit normalized integer formats */
|
| 446 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X1>(void)
|
| 447 |
+
{
|
| 448 |
+
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1);
|
| 449 |
+
}
|
| 450 |
+
|
| 451 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X2>(void)
|
| 452 |
+
{
|
| 453 |
+
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2);
|
| 454 |
+
}
|
| 455 |
+
|
| 456 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X4>(void)
|
| 457 |
+
{
|
| 458 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4);
|
| 459 |
+
}
|
| 460 |
+
|
| 461 |
+
/* Signed 16-bit normalized integer formats */
|
| 462 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X1>(void)
|
| 463 |
+
{
|
| 464 |
+
return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1);
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X2>(void)
|
| 468 |
+
{
|
| 469 |
+
return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2);
|
| 470 |
+
}
|
| 471 |
+
|
| 472 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X4>(void)
|
| 473 |
+
{
|
| 474 |
+
return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4);
|
| 475 |
+
}
|
| 476 |
+
|
| 477 |
+
/* Unsigned 16-bit normalized integer formats */
|
| 478 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X1>(void)
|
| 479 |
+
{
|
| 480 |
+
return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1);
|
| 481 |
+
}
|
| 482 |
+
|
| 483 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X2>(void)
|
| 484 |
+
{
|
| 485 |
+
return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2);
|
| 486 |
+
}
|
| 487 |
+
|
| 488 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X4>(void)
|
| 489 |
+
{
|
| 490 |
+
return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4);
|
| 491 |
+
}
|
| 492 |
+
|
| 493 |
+
/* NV12 format */
|
| 494 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindNV12>(void)
|
| 495 |
+
{
|
| 496 |
+
return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12);
|
| 497 |
+
}
|
| 498 |
+
|
| 499 |
+
/* BC1 format */
|
| 500 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1>(void)
|
| 501 |
+
{
|
| 502 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1);
|
| 503 |
+
}
|
| 504 |
+
|
| 505 |
+
/* BC1sRGB format */
|
| 506 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1SRGB>(void)
|
| 507 |
+
{
|
| 508 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB);
|
| 509 |
+
}
|
| 510 |
+
|
| 511 |
+
/* BC2 format */
|
| 512 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2>(void)
|
| 513 |
+
{
|
| 514 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2);
|
| 515 |
+
}
|
| 516 |
+
|
| 517 |
+
/* BC2sRGB format */
|
| 518 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2SRGB>(void)
|
| 519 |
+
{
|
| 520 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB);
|
| 521 |
+
}
|
| 522 |
+
|
| 523 |
+
/* BC3 format */
|
| 524 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3>(void)
|
| 525 |
+
{
|
| 526 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3);
|
| 527 |
+
}
|
| 528 |
+
|
| 529 |
+
/* BC3sRGB format */
|
| 530 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3SRGB>(void)
|
| 531 |
+
{
|
| 532 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB);
|
| 533 |
+
}
|
| 534 |
+
|
| 535 |
+
/* BC4 unsigned format */
|
| 536 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>(void)
|
| 537 |
+
{
|
| 538 |
+
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4);
|
| 539 |
+
}
|
| 540 |
+
|
| 541 |
+
/* BC4 signed format */
|
| 542 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed4>(void)
|
| 543 |
+
{
|
| 544 |
+
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4);
|
| 545 |
+
}
|
| 546 |
+
|
| 547 |
+
/* BC5 unsigned format */
|
| 548 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed5>(void)
|
| 549 |
+
{
|
| 550 |
+
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5);
|
| 551 |
+
}
|
| 552 |
+
|
| 553 |
+
/* BC5 signed format */
|
| 554 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed5>(void)
|
| 555 |
+
{
|
| 556 |
+
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5);
|
| 557 |
+
}
|
| 558 |
+
|
| 559 |
+
/* BC6H unsigned format */
|
| 560 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed6H>(void)
|
| 561 |
+
{
|
| 562 |
+
return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H);
|
| 563 |
+
}
|
| 564 |
+
|
| 565 |
+
/* BC6H signed format */
|
| 566 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed6H>(void)
|
| 567 |
+
{
|
| 568 |
+
return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H);
|
| 569 |
+
}
|
| 570 |
+
|
| 571 |
+
/* BC7 format */
|
| 572 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>(void)
|
| 573 |
+
{
|
| 574 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7);
|
| 575 |
+
}
|
| 576 |
+
|
| 577 |
+
/* BC7sRGB format */
|
| 578 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7SRGB>(void)
|
| 579 |
+
{
|
| 580 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB);
|
| 581 |
+
}
|
| 582 |
+
|
| 583 |
+
#endif /* __cplusplus */
|
| 584 |
+
|
| 585 |
+
/** @} */
|
| 586 |
+
/** @} */ /* END CUDART_TEXTURE_HL */
|
| 587 |
+
|
| 588 |
+
#endif /* !__CHANNEL_DESCRIPTOR_H__ */
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups.h
ADDED
|
@@ -0,0 +1,1730 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _COOPERATIVE_GROUPS_H_
|
| 51 |
+
#define _COOPERATIVE_GROUPS_H_
|
| 52 |
+
|
| 53 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 54 |
+
|
| 55 |
+
#include "cooperative_groups/details/info.h"
|
| 56 |
+
#include "cooperative_groups/details/driver_abi.h"
|
| 57 |
+
#include "cooperative_groups/details/helpers.h"
|
| 58 |
+
#include "cooperative_groups/details/memory.h"
|
| 59 |
+
|
| 60 |
+
#if defined(_CG_HAS_STL_ATOMICS)
|
| 61 |
+
#include <cuda/atomic>
|
| 62 |
+
#define _CG_THREAD_SCOPE(scope) _CG_STATIC_CONST_DECL cuda::thread_scope thread_scope = scope;
|
| 63 |
+
#else
|
| 64 |
+
#define _CG_THREAD_SCOPE(scope)
|
| 65 |
+
#endif
|
| 66 |
+
|
| 67 |
+
_CG_BEGIN_NAMESPACE
|
| 68 |
+
|
| 69 |
+
namespace details {
|
| 70 |
+
_CG_CONST_DECL unsigned int coalesced_group_id = 1;
|
| 71 |
+
_CG_CONST_DECL unsigned int multi_grid_group_id = 2;
|
| 72 |
+
_CG_CONST_DECL unsigned int grid_group_id = 3;
|
| 73 |
+
_CG_CONST_DECL unsigned int thread_block_id = 4;
|
| 74 |
+
_CG_CONST_DECL unsigned int multi_tile_group_id = 5;
|
| 75 |
+
_CG_CONST_DECL unsigned int cluster_group_id = 6;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
/**
|
| 79 |
+
* class thread_group;
|
| 80 |
+
*
|
| 81 |
+
* Generic thread group type, into which all groups are convertible.
|
| 82 |
+
* It acts as a container for all storage necessary for the derived groups,
|
| 83 |
+
* and will dispatch the API calls to the correct derived group. This means
|
| 84 |
+
* that all derived groups must implement the same interface as thread_group.
|
| 85 |
+
*/
|
| 86 |
+
class thread_group
|
| 87 |
+
{
|
| 88 |
+
protected:
|
| 89 |
+
struct group_data {
|
| 90 |
+
unsigned int _unused : 1;
|
| 91 |
+
unsigned int type : 7, : 0;
|
| 92 |
+
};
|
| 93 |
+
|
| 94 |
+
struct gg_data {
|
| 95 |
+
details::grid_workspace *gridWs;
|
| 96 |
+
};
|
| 97 |
+
|
| 98 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 99 |
+
struct mg_data {
|
| 100 |
+
unsigned long long _unused : 1;
|
| 101 |
+
unsigned long long type : 7;
|
| 102 |
+
unsigned long long handle : 56;
|
| 103 |
+
const details::multi_grid::multi_grid_functions *functions;
|
| 104 |
+
};
|
| 105 |
+
#endif
|
| 106 |
+
|
| 107 |
+
struct tg_data {
|
| 108 |
+
unsigned int is_tiled : 1;
|
| 109 |
+
unsigned int type : 7;
|
| 110 |
+
unsigned int size : 24;
|
| 111 |
+
// packed to 4b
|
| 112 |
+
unsigned int metaGroupSize : 16;
|
| 113 |
+
unsigned int metaGroupRank : 16;
|
| 114 |
+
// packed to 8b
|
| 115 |
+
unsigned int mask;
|
| 116 |
+
// packed to 12b
|
| 117 |
+
unsigned int _res;
|
| 118 |
+
};
|
| 119 |
+
|
| 120 |
+
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
|
| 121 |
+
friend class thread_block;
|
| 122 |
+
|
| 123 |
+
union __align__(8) {
|
| 124 |
+
group_data group;
|
| 125 |
+
tg_data coalesced;
|
| 126 |
+
gg_data grid;
|
| 127 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 128 |
+
mg_data multi_grid;
|
| 129 |
+
#endif
|
| 130 |
+
} _data;
|
| 131 |
+
|
| 132 |
+
_CG_QUALIFIER thread_group operator=(const thread_group& src);
|
| 133 |
+
|
| 134 |
+
_CG_QUALIFIER thread_group(unsigned int type) {
|
| 135 |
+
_data.group.type = type;
|
| 136 |
+
_data.group._unused = false;
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
#ifdef _CG_CPP11_FEATURES
|
| 140 |
+
static_assert(sizeof(tg_data) <= 16, "Failed size check");
|
| 141 |
+
static_assert(sizeof(gg_data) <= 16, "Failed size check");
|
| 142 |
+
# ifdef _CG_ABI_EXPERIMENTAL
|
| 143 |
+
static_assert(sizeof(mg_data) <= 16, "Failed size check");
|
| 144 |
+
# endif
|
| 145 |
+
#endif
|
| 146 |
+
|
| 147 |
+
public:
|
| 148 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
|
| 149 |
+
|
| 150 |
+
_CG_QUALIFIER unsigned long long size() const;
|
| 151 |
+
_CG_QUALIFIER unsigned long long num_threads() const;
|
| 152 |
+
_CG_QUALIFIER unsigned long long thread_rank() const;
|
| 153 |
+
_CG_QUALIFIER void sync() const;
|
| 154 |
+
_CG_QUALIFIER unsigned int get_type() const {
|
| 155 |
+
return _data.group.type;
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
};
|
| 159 |
+
|
| 160 |
+
template <unsigned int TyId>
|
| 161 |
+
struct thread_group_base : public thread_group {
|
| 162 |
+
_CG_QUALIFIER thread_group_base() : thread_group(TyId) {}
|
| 163 |
+
_CG_STATIC_CONST_DECL unsigned int id = TyId;
|
| 164 |
+
};
|
| 165 |
+
|
| 166 |
+
#if defined(_CG_HAS_MULTI_GRID_GROUP)
|
| 167 |
+
|
| 168 |
+
/**
|
| 169 |
+
* class multi_grid_group;
|
| 170 |
+
*
|
| 171 |
+
* Threads within this this group are guaranteed to be co-resident on the
|
| 172 |
+
* same system, on multiple devices within the same launched kernels.
|
| 173 |
+
* To use this group, the kernel must have been launched with
|
| 174 |
+
* cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent),
|
| 175 |
+
* and the device must support it (queryable device attribute).
|
| 176 |
+
*
|
| 177 |
+
* Constructed via this_multi_grid();
|
| 178 |
+
*/
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 182 |
+
class multi_grid_group;
|
| 183 |
+
|
| 184 |
+
// Multi grid group requires these functions to be templated to prevent ptxas from trying to use CG syscalls
|
| 185 |
+
template <typename = void>
|
| 186 |
+
__device__ _CG_DEPRECATED multi_grid_group this_multi_grid();
|
| 187 |
+
|
| 188 |
+
class multi_grid_group : public thread_group_base<details::multi_grid_group_id>
|
| 189 |
+
{
|
| 190 |
+
private:
|
| 191 |
+
template <typename = void>
|
| 192 |
+
_CG_QUALIFIER multi_grid_group() {
|
| 193 |
+
_data.multi_grid.functions = details::multi_grid::load_grid_intrinsics();
|
| 194 |
+
_data.multi_grid.handle = _data.multi_grid.functions->get_intrinsic_handle();
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
friend multi_grid_group this_multi_grid<void>();
|
| 198 |
+
|
| 199 |
+
public:
|
| 200 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
|
| 201 |
+
|
| 202 |
+
_CG_QUALIFIER bool is_valid() const {
|
| 203 |
+
return (_data.multi_grid.handle != 0);
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
_CG_QUALIFIER void sync() const {
|
| 207 |
+
if (!is_valid()) {
|
| 208 |
+
_CG_ABORT();
|
| 209 |
+
}
|
| 210 |
+
_data.multi_grid.functions->sync(_data.multi_grid.handle);
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
_CG_QUALIFIER unsigned long long num_threads() const {
|
| 214 |
+
_CG_ASSERT(is_valid());
|
| 215 |
+
return _data.multi_grid.functions->size(_data.multi_grid.handle);
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
_CG_QUALIFIER unsigned long long size() const {
|
| 219 |
+
return num_threads();
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
_CG_QUALIFIER unsigned long long thread_rank() const {
|
| 223 |
+
_CG_ASSERT(is_valid());
|
| 224 |
+
return _data.multi_grid.functions->thread_rank(_data.multi_grid.handle);
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
_CG_QUALIFIER unsigned int grid_rank() const {
|
| 228 |
+
_CG_ASSERT(is_valid());
|
| 229 |
+
return (_data.multi_grid.functions->grid_rank(_data.multi_grid.handle));
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
_CG_QUALIFIER unsigned int num_grids() const {
|
| 233 |
+
_CG_ASSERT(is_valid());
|
| 234 |
+
return (_data.multi_grid.functions->num_grids(_data.multi_grid.handle));
|
| 235 |
+
}
|
| 236 |
+
};
|
| 237 |
+
# else
|
| 238 |
+
class multi_grid_group
|
| 239 |
+
{
|
| 240 |
+
private:
|
| 241 |
+
unsigned long long _handle;
|
| 242 |
+
unsigned int _size;
|
| 243 |
+
unsigned int _rank;
|
| 244 |
+
|
| 245 |
+
friend _CG_QUALIFIER multi_grid_group this_multi_grid();
|
| 246 |
+
|
| 247 |
+
_CG_QUALIFIER multi_grid_group() {
|
| 248 |
+
_handle = details::multi_grid::get_intrinsic_handle();
|
| 249 |
+
_size = details::multi_grid::size(_handle);
|
| 250 |
+
_rank = details::multi_grid::thread_rank(_handle);
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
public:
|
| 254 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
|
| 255 |
+
|
| 256 |
+
_CG_QUALIFIER _CG_DEPRECATED bool is_valid() const {
|
| 257 |
+
return (_handle != 0);
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
_CG_QUALIFIER _CG_DEPRECATED void sync() const {
|
| 261 |
+
if (!is_valid()) {
|
| 262 |
+
_CG_ABORT();
|
| 263 |
+
}
|
| 264 |
+
details::multi_grid::sync(_handle);
|
| 265 |
+
}
|
| 266 |
+
|
| 267 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned long long num_threads() const {
|
| 268 |
+
_CG_ASSERT(is_valid());
|
| 269 |
+
return _size;
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned long long size() const {
|
| 273 |
+
return num_threads();
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned long long thread_rank() const {
|
| 277 |
+
_CG_ASSERT(is_valid());
|
| 278 |
+
return _rank;
|
| 279 |
+
}
|
| 280 |
+
|
| 281 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned int grid_rank() const {
|
| 282 |
+
_CG_ASSERT(is_valid());
|
| 283 |
+
return (details::multi_grid::grid_rank(_handle));
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned int num_grids() const {
|
| 287 |
+
_CG_ASSERT(is_valid());
|
| 288 |
+
return (details::multi_grid::num_grids(_handle));
|
| 289 |
+
}
|
| 290 |
+
};
|
| 291 |
+
# endif
|
| 292 |
+
|
| 293 |
+
/**
|
| 294 |
+
* multi_grid_group this_multi_grid()
|
| 295 |
+
*
|
| 296 |
+
* Constructs a multi_grid_group
|
| 297 |
+
*/
|
| 298 |
+
# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 299 |
+
template <typename>
|
| 300 |
+
__device__
|
| 301 |
+
#else
|
| 302 |
+
_CG_QUALIFIER
|
| 303 |
+
# endif
|
| 304 |
+
_CG_DEPRECATED
|
| 305 |
+
multi_grid_group this_multi_grid()
|
| 306 |
+
{
|
| 307 |
+
return multi_grid_group();
|
| 308 |
+
}
|
| 309 |
+
#endif
|
| 310 |
+
|
| 311 |
+
/**
|
| 312 |
+
* class grid_group;
|
| 313 |
+
*
|
| 314 |
+
* Threads within this this group are guaranteed to be co-resident on the
|
| 315 |
+
* same device within the same launched kernel. To use this group, the kernel
|
| 316 |
+
* must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent),
|
| 317 |
+
* and the device must support it (queryable device attribute).
|
| 318 |
+
*
|
| 319 |
+
* Constructed via this_grid();
|
| 320 |
+
*/
|
| 321 |
+
class grid_group : public thread_group_base<details::grid_group_id>
|
| 322 |
+
{
|
| 323 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::grid_group_id;
|
| 324 |
+
friend _CG_QUALIFIER grid_group this_grid();
|
| 325 |
+
|
| 326 |
+
private:
|
| 327 |
+
_CG_QUALIFIER grid_group(details::grid_workspace *gridWs) {
|
| 328 |
+
_data.grid.gridWs = gridWs;
|
| 329 |
+
}
|
| 330 |
+
|
| 331 |
+
public:
|
| 332 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
|
| 333 |
+
|
| 334 |
+
_CG_QUALIFIER bool is_valid() const {
|
| 335 |
+
return (_data.grid.gridWs != NULL);
|
| 336 |
+
}
|
| 337 |
+
|
| 338 |
+
_CG_QUALIFIER void sync() const {
|
| 339 |
+
if (!is_valid()) {
|
| 340 |
+
_CG_ABORT();
|
| 341 |
+
}
|
| 342 |
+
details::grid::sync(&_data.grid.gridWs->barrier);
|
| 343 |
+
}
|
| 344 |
+
|
| 345 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 346 |
+
using arrival_token = unsigned int;
|
| 347 |
+
|
| 348 |
+
_CG_QUALIFIER arrival_token barrier_arrive() const {
|
| 349 |
+
if (!is_valid()) {
|
| 350 |
+
_CG_ABORT();
|
| 351 |
+
}
|
| 352 |
+
return details::grid::barrier_arrive(&_data.grid.gridWs->barrier);
|
| 353 |
+
}
|
| 354 |
+
|
| 355 |
+
_CG_QUALIFIER void barrier_wait(arrival_token&& token) const {
|
| 356 |
+
details::grid::barrier_wait(token, &_data.grid.gridWs->barrier);
|
| 357 |
+
}
|
| 358 |
+
#endif
|
| 359 |
+
|
| 360 |
+
_CG_STATIC_QUALIFIER unsigned long long size() {
|
| 361 |
+
return details::grid::size();
|
| 362 |
+
}
|
| 363 |
+
|
| 364 |
+
_CG_STATIC_QUALIFIER dim3 group_dim() {
|
| 365 |
+
return details::grid::grid_dim();
|
| 366 |
+
}
|
| 367 |
+
|
| 368 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads() {
|
| 369 |
+
return details::grid::dim_threads();
|
| 370 |
+
}
|
| 371 |
+
|
| 372 |
+
_CG_STATIC_QUALIFIER unsigned long long num_threads() {
|
| 373 |
+
return details::grid::num_threads();
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
_CG_STATIC_QUALIFIER dim3 thread_index() {
|
| 377 |
+
return details::grid::thread_index();
|
| 378 |
+
}
|
| 379 |
+
|
| 380 |
+
_CG_STATIC_QUALIFIER unsigned long long thread_rank() {
|
| 381 |
+
return details::grid::thread_rank();
|
| 382 |
+
}
|
| 383 |
+
|
| 384 |
+
_CG_STATIC_QUALIFIER dim3 dim_blocks() {
|
| 385 |
+
return details::grid::dim_blocks();
|
| 386 |
+
}
|
| 387 |
+
|
| 388 |
+
_CG_STATIC_QUALIFIER unsigned long long num_blocks() {
|
| 389 |
+
return details::grid::num_blocks();
|
| 390 |
+
}
|
| 391 |
+
|
| 392 |
+
_CG_STATIC_QUALIFIER dim3 block_index() {
|
| 393 |
+
return details::grid::block_index();
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
_CG_STATIC_QUALIFIER unsigned long long block_rank() {
|
| 397 |
+
return details::grid::block_rank();
|
| 398 |
+
}
|
| 399 |
+
|
| 400 |
+
# if defined(_CG_HAS_CLUSTER_GROUP)
|
| 401 |
+
_CG_STATIC_QUALIFIER dim3 dim_clusters() {
|
| 402 |
+
return details::grid::dim_clusters();
|
| 403 |
+
}
|
| 404 |
+
|
| 405 |
+
_CG_STATIC_QUALIFIER unsigned long long num_clusters() {
|
| 406 |
+
return details::grid::num_clusters();
|
| 407 |
+
}
|
| 408 |
+
|
| 409 |
+
_CG_STATIC_QUALIFIER dim3 cluster_index() {
|
| 410 |
+
return details::grid::cluster_index();
|
| 411 |
+
}
|
| 412 |
+
|
| 413 |
+
_CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
|
| 414 |
+
return details::grid::cluster_rank();
|
| 415 |
+
}
|
| 416 |
+
# endif
|
| 417 |
+
};
|
| 418 |
+
|
| 419 |
+
_CG_QUALIFIER grid_group this_grid() {
|
| 420 |
+
// Load a workspace from the driver
|
| 421 |
+
grid_group gg(details::get_grid_workspace());
|
| 422 |
+
#ifdef _CG_DEBUG
|
| 423 |
+
// *all* threads must be available to synchronize
|
| 424 |
+
gg.sync();
|
| 425 |
+
#endif // _CG_DEBUG
|
| 426 |
+
return gg;
|
| 427 |
+
}
|
| 428 |
+
|
| 429 |
+
#if defined(_CG_HAS_CLUSTER_GROUP)
|
| 430 |
+
/**
|
| 431 |
+
* class cluster_group
|
| 432 |
+
*
|
| 433 |
+
* Every GPU kernel is executed by a grid of thread blocks. A grid can be evenly
|
| 434 |
+
* divided along all dimensions to form groups of blocks, each group of which is
|
| 435 |
+
* a block cluster. Clustered grids are subject to various restrictions and
|
| 436 |
+
* limitations. Primarily, a cluster consists of at most 8 blocks by default
|
| 437 |
+
* (although the user is allowed to opt-in to non-standard sizes,) and clustered
|
| 438 |
+
* grids are subject to additional occupancy limitations due to per-cluster
|
| 439 |
+
* hardware resource consumption. In exchange, a block cluster is guaranteed to
|
| 440 |
+
* be a cooperative group, with access to all cooperative group capabilities, as
|
| 441 |
+
* well as cluster specific capabilities and accelerations. A cluster_group
|
| 442 |
+
* represents a block cluster.
|
| 443 |
+
*
|
| 444 |
+
* Constructed via this_cluster_group();
|
| 445 |
+
*/
|
| 446 |
+
class cluster_group : public thread_group_base<details::cluster_group_id>
|
| 447 |
+
{
|
| 448 |
+
// Friends
|
| 449 |
+
friend _CG_QUALIFIER cluster_group this_cluster();
|
| 450 |
+
|
| 451 |
+
// Disable constructor
|
| 452 |
+
_CG_QUALIFIER cluster_group()
|
| 453 |
+
{
|
| 454 |
+
}
|
| 455 |
+
|
| 456 |
+
public:
|
| 457 |
+
//_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_cluster)
|
| 458 |
+
|
| 459 |
+
using arrival_token = struct {};
|
| 460 |
+
|
| 461 |
+
// Functionality exposed by the group
|
| 462 |
+
_CG_STATIC_QUALIFIER void sync()
|
| 463 |
+
{
|
| 464 |
+
return details::cluster::sync();
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
_CG_STATIC_QUALIFIER arrival_token barrier_arrive()
|
| 468 |
+
{
|
| 469 |
+
details::cluster::barrier_arrive();
|
| 470 |
+
return arrival_token();
|
| 471 |
+
}
|
| 472 |
+
|
| 473 |
+
_CG_STATIC_QUALIFIER void barrier_wait()
|
| 474 |
+
{
|
| 475 |
+
return details::cluster::barrier_wait();
|
| 476 |
+
}
|
| 477 |
+
|
| 478 |
+
_CG_STATIC_QUALIFIER void barrier_wait(arrival_token&&)
|
| 479 |
+
{
|
| 480 |
+
return details::cluster::barrier_wait();
|
| 481 |
+
}
|
| 482 |
+
|
| 483 |
+
_CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
|
| 484 |
+
{
|
| 485 |
+
return details::cluster::query_shared_rank(addr);
|
| 486 |
+
}
|
| 487 |
+
|
| 488 |
+
template <typename T>
|
| 489 |
+
_CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
|
| 490 |
+
{
|
| 491 |
+
return details::cluster::map_shared_rank(addr, rank);
|
| 492 |
+
}
|
| 493 |
+
|
| 494 |
+
_CG_STATIC_QUALIFIER dim3 block_index()
|
| 495 |
+
{
|
| 496 |
+
return details::cluster::block_index();
|
| 497 |
+
}
|
| 498 |
+
|
| 499 |
+
_CG_STATIC_QUALIFIER unsigned int block_rank()
|
| 500 |
+
{
|
| 501 |
+
return details::cluster::block_rank();
|
| 502 |
+
}
|
| 503 |
+
|
| 504 |
+
_CG_STATIC_QUALIFIER dim3 thread_index()
|
| 505 |
+
{
|
| 506 |
+
return details::cluster::thread_index();
|
| 507 |
+
}
|
| 508 |
+
|
| 509 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank()
|
| 510 |
+
{
|
| 511 |
+
return details::cluster::thread_rank();
|
| 512 |
+
}
|
| 513 |
+
|
| 514 |
+
_CG_STATIC_QUALIFIER dim3 dim_blocks()
|
| 515 |
+
{
|
| 516 |
+
return details::cluster::dim_blocks();
|
| 517 |
+
}
|
| 518 |
+
|
| 519 |
+
_CG_STATIC_QUALIFIER unsigned int num_blocks()
|
| 520 |
+
{
|
| 521 |
+
return details::cluster::num_blocks();
|
| 522 |
+
}
|
| 523 |
+
|
| 524 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads()
|
| 525 |
+
{
|
| 526 |
+
return details::cluster::dim_threads();
|
| 527 |
+
}
|
| 528 |
+
|
| 529 |
+
_CG_STATIC_QUALIFIER unsigned int num_threads()
|
| 530 |
+
{
|
| 531 |
+
return details::cluster::num_threads();
|
| 532 |
+
}
|
| 533 |
+
|
| 534 |
+
// Legacy aliases
|
| 535 |
+
_CG_STATIC_QUALIFIER unsigned int size()
|
| 536 |
+
{
|
| 537 |
+
return num_threads();
|
| 538 |
+
}
|
| 539 |
+
};
|
| 540 |
+
|
| 541 |
+
/*
|
| 542 |
+
* cluster_group this_cluster()
|
| 543 |
+
*
|
| 544 |
+
* Constructs a cluster_group
|
| 545 |
+
*/
|
| 546 |
+
_CG_QUALIFIER cluster_group this_cluster()
|
| 547 |
+
{
|
| 548 |
+
cluster_group cg;
|
| 549 |
+
#ifdef _CG_DEBUG
|
| 550 |
+
cg.sync();
|
| 551 |
+
#endif
|
| 552 |
+
return cg;
|
| 553 |
+
}
|
| 554 |
+
#endif
|
| 555 |
+
|
| 556 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 557 |
+
class thread_block;
|
| 558 |
+
template <unsigned int MaxBlockSize>
|
| 559 |
+
_CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
|
| 560 |
+
#endif
|
| 561 |
+
|
| 562 |
+
/**
|
| 563 |
+
* class thread_block
|
| 564 |
+
*
|
| 565 |
+
* Every GPU kernel is executed by a grid of thread blocks, and threads within
|
| 566 |
+
* each block are guaranteed to reside on the same streaming multiprocessor.
|
| 567 |
+
* A thread_block represents a thread block whose dimensions are not known until runtime.
|
| 568 |
+
*
|
| 569 |
+
* Constructed via this_thread_block();
|
| 570 |
+
*/
|
| 571 |
+
class thread_block : public thread_group_base<details::thread_block_id>
|
| 572 |
+
{
|
| 573 |
+
// Friends
|
| 574 |
+
friend _CG_QUALIFIER thread_block this_thread_block();
|
| 575 |
+
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
|
| 576 |
+
friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);
|
| 577 |
+
|
| 578 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 579 |
+
template <unsigned int MaxBlockSize>
|
| 580 |
+
friend _CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
|
| 581 |
+
template <unsigned int Size>
|
| 582 |
+
friend class __static_size_multi_warp_tile_base;
|
| 583 |
+
|
| 584 |
+
details::multi_warp_scratch* const tile_memory;
|
| 585 |
+
|
| 586 |
+
template <unsigned int MaxBlockSize>
|
| 587 |
+
_CG_QUALIFIER thread_block(block_tile_memory<MaxBlockSize>& scratch) :
|
| 588 |
+
tile_memory(details::get_scratch_ptr(&scratch)) {
|
| 589 |
+
#ifdef _CG_DEBUG
|
| 590 |
+
if (num_threads() > MaxBlockSize) {
|
| 591 |
+
details::abort();
|
| 592 |
+
}
|
| 593 |
+
#endif
|
| 594 |
+
#if !defined(_CG_HAS_RESERVED_SHARED)
|
| 595 |
+
tile_memory->init_barriers(thread_rank());
|
| 596 |
+
sync();
|
| 597 |
+
#endif
|
| 598 |
+
}
|
| 599 |
+
#endif
|
| 600 |
+
|
| 601 |
+
// Disable constructor
|
| 602 |
+
_CG_QUALIFIER thread_block()
|
| 603 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 604 |
+
: tile_memory(details::get_scratch_ptr(NULL))
|
| 605 |
+
#endif
|
| 606 |
+
{ }
|
| 607 |
+
|
| 608 |
+
// Internal Use
|
| 609 |
+
_CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const {
|
| 610 |
+
const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
|
| 611 |
+
|
| 612 |
+
// Invalid, immediately fail
|
| 613 |
+
if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
|
| 614 |
+
details::abort();
|
| 615 |
+
return (thread_block());
|
| 616 |
+
}
|
| 617 |
+
|
| 618 |
+
unsigned int mask;
|
| 619 |
+
unsigned int base_offset = thread_rank() & (~(tilesz - 1));
|
| 620 |
+
unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
|
| 621 |
+
|
| 622 |
+
mask = (unsigned int)(-1) >> (32 - masklength);
|
| 623 |
+
mask <<= (details::laneid() & ~(tilesz - 1));
|
| 624 |
+
thread_group tile = thread_group(details::coalesced_group_id);
|
| 625 |
+
tile._data.coalesced.mask = mask;
|
| 626 |
+
tile._data.coalesced.size = __popc(mask);
|
| 627 |
+
tile._data.coalesced.metaGroupSize = (details::cta::size() + tilesz - 1) / tilesz;
|
| 628 |
+
tile._data.coalesced.metaGroupRank = details::cta::thread_rank() / tilesz;
|
| 629 |
+
tile._data.coalesced.is_tiled = true;
|
| 630 |
+
return (tile);
|
| 631 |
+
}
|
| 632 |
+
|
| 633 |
+
public:
|
| 634 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::thread_block_id;
|
| 635 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
|
| 636 |
+
|
| 637 |
+
_CG_STATIC_QUALIFIER void sync() {
|
| 638 |
+
details::cta::sync();
|
| 639 |
+
}
|
| 640 |
+
|
| 641 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 642 |
+
struct arrival_token {};
|
| 643 |
+
|
| 644 |
+
_CG_QUALIFIER arrival_token barrier_arrive() const {
|
| 645 |
+
return arrival_token();
|
| 646 |
+
}
|
| 647 |
+
|
| 648 |
+
_CG_QUALIFIER void barrier_wait(arrival_token&&) const {
|
| 649 |
+
details::cta::sync();
|
| 650 |
+
}
|
| 651 |
+
#endif
|
| 652 |
+
|
| 653 |
+
_CG_STATIC_QUALIFIER unsigned int size() {
|
| 654 |
+
return details::cta::size();
|
| 655 |
+
}
|
| 656 |
+
|
| 657 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank() {
|
| 658 |
+
return details::cta::thread_rank();
|
| 659 |
+
}
|
| 660 |
+
|
| 661 |
+
// Additional functionality exposed by the group
|
| 662 |
+
_CG_STATIC_QUALIFIER dim3 group_index() {
|
| 663 |
+
return details::cta::group_index();
|
| 664 |
+
}
|
| 665 |
+
|
| 666 |
+
_CG_STATIC_QUALIFIER dim3 thread_index() {
|
| 667 |
+
return details::cta::thread_index();
|
| 668 |
+
}
|
| 669 |
+
|
| 670 |
+
_CG_STATIC_QUALIFIER dim3 group_dim() {
|
| 671 |
+
return details::cta::block_dim();
|
| 672 |
+
}
|
| 673 |
+
|
| 674 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads() {
|
| 675 |
+
return details::cta::dim_threads();
|
| 676 |
+
}
|
| 677 |
+
|
| 678 |
+
_CG_STATIC_QUALIFIER unsigned int num_threads() {
|
| 679 |
+
return details::cta::num_threads();
|
| 680 |
+
}
|
| 681 |
+
|
| 682 |
+
};
|
| 683 |
+
|
| 684 |
+
/**
|
| 685 |
+
* thread_block this_thread_block()
|
| 686 |
+
*
|
| 687 |
+
* Constructs a thread_block group
|
| 688 |
+
*/
|
| 689 |
+
_CG_QUALIFIER thread_block this_thread_block()
|
| 690 |
+
{
|
| 691 |
+
return (thread_block());
|
| 692 |
+
}
|
| 693 |
+
|
| 694 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 695 |
+
template <unsigned int MaxBlockSize>
|
| 696 |
+
_CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch) {
|
| 697 |
+
return (thread_block(scratch));
|
| 698 |
+
}
|
| 699 |
+
#endif
|
| 700 |
+
|
| 701 |
+
/**
|
| 702 |
+
* class coalesced_group
|
| 703 |
+
*
|
| 704 |
+
* A group representing the current set of converged threads in a warp.
|
| 705 |
+
* The size of the group is not guaranteed and it may return a group of
|
| 706 |
+
* only one thread (itself).
|
| 707 |
+
*
|
| 708 |
+
* This group exposes warp-synchronous builtins.
|
| 709 |
+
* Constructed via coalesced_threads();
|
| 710 |
+
*/
|
| 711 |
+
class coalesced_group : public thread_group_base<details::coalesced_group_id>
|
| 712 |
+
{
|
| 713 |
+
private:
|
| 714 |
+
friend _CG_QUALIFIER coalesced_group coalesced_threads();
|
| 715 |
+
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
|
| 716 |
+
friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz);
|
| 717 |
+
friend class details::_coalesced_group_data_access;
|
| 718 |
+
|
| 719 |
+
_CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const {
|
| 720 |
+
unsigned int member_pack = 0;
|
| 721 |
+
unsigned int member_rank = 0;
|
| 722 |
+
for (int bit_idx = 0; bit_idx < 32; bit_idx++) {
|
| 723 |
+
unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
|
| 724 |
+
if (lane_bit) {
|
| 725 |
+
if (laneMask & lane_bit)
|
| 726 |
+
member_pack |= 1 << member_rank;
|
| 727 |
+
member_rank++;
|
| 728 |
+
}
|
| 729 |
+
}
|
| 730 |
+
return (member_pack);
|
| 731 |
+
}
|
| 732 |
+
|
| 733 |
+
// Internal Use
|
| 734 |
+
_CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const {
|
| 735 |
+
const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
|
| 736 |
+
|
| 737 |
+
// Invalid, immediately fail
|
| 738 |
+
if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
|
| 739 |
+
details::abort();
|
| 740 |
+
return (coalesced_group(0));
|
| 741 |
+
}
|
| 742 |
+
if (size() <= tilesz) {
|
| 743 |
+
return (*this);
|
| 744 |
+
}
|
| 745 |
+
|
| 746 |
+
if ((_data.coalesced.is_tiled == true) && pow2_tilesz) {
|
| 747 |
+
unsigned int base_offset = (thread_rank() & (~(tilesz - 1)));
|
| 748 |
+
unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
|
| 749 |
+
unsigned int mask = (unsigned int)(-1) >> (32 - masklength);
|
| 750 |
+
|
| 751 |
+
mask <<= (details::laneid() & ~(tilesz - 1));
|
| 752 |
+
coalesced_group coalesced_tile = coalesced_group(mask);
|
| 753 |
+
coalesced_tile._data.coalesced.metaGroupSize = size() / tilesz;
|
| 754 |
+
coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
|
| 755 |
+
coalesced_tile._data.coalesced.is_tiled = true;
|
| 756 |
+
return (coalesced_tile);
|
| 757 |
+
}
|
| 758 |
+
else if ((_data.coalesced.is_tiled == false) && pow2_tilesz) {
|
| 759 |
+
unsigned int mask = 0;
|
| 760 |
+
unsigned int member_rank = 0;
|
| 761 |
+
int seen_lanes = (thread_rank() / tilesz) * tilesz;
|
| 762 |
+
for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
|
| 763 |
+
unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
|
| 764 |
+
if (lane_bit) {
|
| 765 |
+
if (seen_lanes <= 0 && member_rank < tilesz) {
|
| 766 |
+
mask |= lane_bit;
|
| 767 |
+
member_rank++;
|
| 768 |
+
}
|
| 769 |
+
seen_lanes--;
|
| 770 |
+
}
|
| 771 |
+
}
|
| 772 |
+
coalesced_group coalesced_tile = coalesced_group(mask);
|
| 773 |
+
// Override parent with the size of this group
|
| 774 |
+
coalesced_tile._data.coalesced.metaGroupSize = (size() + tilesz - 1) / tilesz;
|
| 775 |
+
coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
|
| 776 |
+
return coalesced_tile;
|
| 777 |
+
}
|
| 778 |
+
else {
|
| 779 |
+
// None in _CG_VERSION 1000
|
| 780 |
+
details::abort();
|
| 781 |
+
}
|
| 782 |
+
|
| 783 |
+
return (coalesced_group(0));
|
| 784 |
+
}
|
| 785 |
+
|
| 786 |
+
protected:
|
| 787 |
+
_CG_QUALIFIER coalesced_group(unsigned int mask) {
|
| 788 |
+
_data.coalesced.mask = mask;
|
| 789 |
+
_data.coalesced.size = __popc(mask);
|
| 790 |
+
_data.coalesced.metaGroupRank = 0;
|
| 791 |
+
_data.coalesced.metaGroupSize = 1;
|
| 792 |
+
_data.coalesced.is_tiled = false;
|
| 793 |
+
}
|
| 794 |
+
|
| 795 |
+
_CG_QUALIFIER unsigned int get_mask() const {
|
| 796 |
+
return (_data.coalesced.mask);
|
| 797 |
+
}
|
| 798 |
+
|
| 799 |
+
public:
|
| 800 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
|
| 801 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
|
| 802 |
+
|
| 803 |
+
_CG_QUALIFIER unsigned int num_threads() const {
|
| 804 |
+
return _data.coalesced.size;
|
| 805 |
+
}
|
| 806 |
+
|
| 807 |
+
_CG_QUALIFIER unsigned int size() const {
|
| 808 |
+
return num_threads();
|
| 809 |
+
}
|
| 810 |
+
|
| 811 |
+
_CG_QUALIFIER unsigned int thread_rank() const {
|
| 812 |
+
return (__popc(_data.coalesced.mask & details::lanemask32_lt()));
|
| 813 |
+
}
|
| 814 |
+
|
| 815 |
+
// Rank of this group in the upper level of the hierarchy
|
| 816 |
+
_CG_QUALIFIER unsigned int meta_group_rank() const {
|
| 817 |
+
return _data.coalesced.metaGroupRank;
|
| 818 |
+
}
|
| 819 |
+
|
| 820 |
+
// Total num partitions created out of all CTAs when the group was created
|
| 821 |
+
_CG_QUALIFIER unsigned int meta_group_size() const {
|
| 822 |
+
return _data.coalesced.metaGroupSize;
|
| 823 |
+
}
|
| 824 |
+
|
| 825 |
+
_CG_QUALIFIER void sync() const {
|
| 826 |
+
__syncwarp(_data.coalesced.mask);
|
| 827 |
+
}
|
| 828 |
+
|
| 829 |
+
#ifdef _CG_CPP11_FEATURES
|
| 830 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 831 |
+
_CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
|
| 832 |
+
unsigned int lane = (srcRank == 0) ? __ffs(_data.coalesced.mask) - 1 :
|
| 833 |
+
(size() == 32) ? srcRank : __fns(_data.coalesced.mask, 0, (srcRank + 1));
|
| 834 |
+
|
| 835 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl(
|
| 836 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
|
| 837 |
+
}
|
| 838 |
+
|
| 839 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 840 |
+
_CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
|
| 841 |
+
if (size() == 32) {
|
| 842 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_down(
|
| 843 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
|
| 844 |
+
}
|
| 845 |
+
|
| 846 |
+
unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
|
| 847 |
+
|
| 848 |
+
if (lane >= 32)
|
| 849 |
+
lane = details::laneid();
|
| 850 |
+
|
| 851 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl(
|
| 852 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
|
| 853 |
+
}
|
| 854 |
+
|
| 855 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 856 |
+
_CG_QUALIFIER TyRet shfl_up(TyElem&& elem, int delta) const {
|
| 857 |
+
if (size() == 32) {
|
| 858 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_up(
|
| 859 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
|
| 860 |
+
}
|
| 861 |
+
|
| 862 |
+
unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
|
| 863 |
+
if (lane >= 32)
|
| 864 |
+
lane = details::laneid();
|
| 865 |
+
|
| 866 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl(
|
| 867 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
|
| 868 |
+
}
|
| 869 |
+
#else
|
| 870 |
+
template <typename TyIntegral>
|
| 871 |
+
_CG_QUALIFIER TyIntegral shfl(TyIntegral var, unsigned int src_rank) const {
|
| 872 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 873 |
+
unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 :
|
| 874 |
+
(size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1));
|
| 875 |
+
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
|
| 876 |
+
}
|
| 877 |
+
|
| 878 |
+
template <typename TyIntegral>
|
| 879 |
+
_CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, int delta) const {
|
| 880 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 881 |
+
if (size() == 32) {
|
| 882 |
+
return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32));
|
| 883 |
+
}
|
| 884 |
+
unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
|
| 885 |
+
if (lane >= 32) lane = details::laneid();
|
| 886 |
+
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
|
| 887 |
+
}
|
| 888 |
+
|
| 889 |
+
template <typename TyIntegral>
|
| 890 |
+
_CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, int delta) const {
|
| 891 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 892 |
+
if (size() == 32) {
|
| 893 |
+
return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32));
|
| 894 |
+
}
|
| 895 |
+
unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
|
| 896 |
+
if (lane >= 32) lane = details::laneid();
|
| 897 |
+
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
|
| 898 |
+
}
|
| 899 |
+
#endif
|
| 900 |
+
|
| 901 |
+
_CG_QUALIFIER int any(int predicate) const {
|
| 902 |
+
return (__ballot_sync(_data.coalesced.mask, predicate) != 0);
|
| 903 |
+
}
|
| 904 |
+
_CG_QUALIFIER int all(int predicate) const {
|
| 905 |
+
return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask);
|
| 906 |
+
}
|
| 907 |
+
_CG_QUALIFIER unsigned int ballot(int predicate) const {
|
| 908 |
+
if (size() == 32) {
|
| 909 |
+
return (__ballot_sync(0xFFFFFFFF, predicate));
|
| 910 |
+
}
|
| 911 |
+
unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate);
|
| 912 |
+
return (_packLanes(lane_ballot));
|
| 913 |
+
}
|
| 914 |
+
|
| 915 |
+
#ifdef _CG_HAS_MATCH_COLLECTIVE
|
| 916 |
+
|
| 917 |
+
template <typename TyIntegral>
|
| 918 |
+
_CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
|
| 919 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 920 |
+
if (size() == 32) {
|
| 921 |
+
return (__match_any_sync(0xFFFFFFFF, val));
|
| 922 |
+
}
|
| 923 |
+
unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val);
|
| 924 |
+
return (_packLanes(lane_match));
|
| 925 |
+
}
|
| 926 |
+
|
| 927 |
+
template <typename TyIntegral>
|
| 928 |
+
_CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
|
| 929 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 930 |
+
if (size() == 32) {
|
| 931 |
+
return (__match_all_sync(0xFFFFFFFF, val, &pred));
|
| 932 |
+
}
|
| 933 |
+
unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred);
|
| 934 |
+
return (_packLanes(lane_match));
|
| 935 |
+
}
|
| 936 |
+
|
| 937 |
+
#endif /* !_CG_HAS_MATCH_COLLECTIVE */
|
| 938 |
+
|
| 939 |
+
};
|
| 940 |
+
|
| 941 |
+
_CG_QUALIFIER coalesced_group coalesced_threads()
|
| 942 |
+
{
|
| 943 |
+
return (coalesced_group(__activemask()));
|
| 944 |
+
}
|
| 945 |
+
|
| 946 |
+
namespace details {
|
| 947 |
+
template <unsigned int Size> struct verify_thread_block_tile_size;
|
| 948 |
+
template <> struct verify_thread_block_tile_size<32> { typedef void OK; };
|
| 949 |
+
template <> struct verify_thread_block_tile_size<16> { typedef void OK; };
|
| 950 |
+
template <> struct verify_thread_block_tile_size<8> { typedef void OK; };
|
| 951 |
+
template <> struct verify_thread_block_tile_size<4> { typedef void OK; };
|
| 952 |
+
template <> struct verify_thread_block_tile_size<2> { typedef void OK; };
|
| 953 |
+
template <> struct verify_thread_block_tile_size<1> { typedef void OK; };
|
| 954 |
+
|
| 955 |
+
#ifdef _CG_CPP11_FEATURES
|
| 956 |
+
template <unsigned int Size>
|
| 957 |
+
using _is_power_of_2 = _CG_STL_NAMESPACE::integral_constant<bool, (Size & (Size - 1)) == 0>;
|
| 958 |
+
|
| 959 |
+
template <unsigned int Size>
|
| 960 |
+
using _is_single_warp = _CG_STL_NAMESPACE::integral_constant<bool, Size <= 32>;
|
| 961 |
+
template <unsigned int Size>
|
| 962 |
+
using _is_multi_warp =
|
| 963 |
+
_CG_STL_NAMESPACE::integral_constant<bool, (Size > 32) && (Size <= 1024)>;
|
| 964 |
+
|
| 965 |
+
template <unsigned int Size>
|
| 966 |
+
using _is_valid_single_warp_tile =
|
| 967 |
+
_CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_single_warp<Size>::value>;
|
| 968 |
+
template <unsigned int Size>
|
| 969 |
+
using _is_valid_multi_warp_tile =
|
| 970 |
+
_CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_multi_warp<Size>::value>;
|
| 971 |
+
#else
|
| 972 |
+
template <unsigned int Size>
|
| 973 |
+
struct _is_multi_warp {
|
| 974 |
+
static const bool value = false;
|
| 975 |
+
};
|
| 976 |
+
#endif
|
| 977 |
+
}
|
| 978 |
+
|
| 979 |
+
template <unsigned int Size>
|
| 980 |
+
class __static_size_tile_base
|
| 981 |
+
{
|
| 982 |
+
protected:
|
| 983 |
+
_CG_STATIC_CONST_DECL unsigned int numThreads = Size;
|
| 984 |
+
|
| 985 |
+
public:
|
| 986 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
|
| 987 |
+
|
| 988 |
+
// Rank of thread within tile
|
| 989 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank() {
|
| 990 |
+
return (details::cta::thread_rank() & (numThreads - 1));
|
| 991 |
+
}
|
| 992 |
+
|
| 993 |
+
// Number of threads within tile
|
| 994 |
+
_CG_STATIC_CONSTEXPR_QUALIFIER unsigned int num_threads() {
|
| 995 |
+
return numThreads;
|
| 996 |
+
}
|
| 997 |
+
|
| 998 |
+
_CG_STATIC_CONSTEXPR_QUALIFIER unsigned int size() {
|
| 999 |
+
return num_threads();
|
| 1000 |
+
}
|
| 1001 |
+
};
|
| 1002 |
+
|
| 1003 |
+
template <unsigned int Size>
|
| 1004 |
+
class __static_size_thread_block_tile_base : public __static_size_tile_base<Size>
|
| 1005 |
+
{
|
| 1006 |
+
friend class details::_coalesced_group_data_access;
|
| 1007 |
+
typedef details::tile::tile_helpers<Size> th;
|
| 1008 |
+
|
| 1009 |
+
#ifdef _CG_CPP11_FEATURES
|
| 1010 |
+
static_assert(details::_is_valid_single_warp_tile<Size>::value, "Size must be one of 1/2/4/8/16/32");
|
| 1011 |
+
#else
|
| 1012 |
+
typedef typename details::verify_thread_block_tile_size<Size>::OK valid;
|
| 1013 |
+
#endif
|
| 1014 |
+
using __static_size_tile_base<Size>::numThreads;
|
| 1015 |
+
_CG_STATIC_CONST_DECL unsigned int fullMask = 0xFFFFFFFF;
|
| 1016 |
+
|
| 1017 |
+
protected:
|
| 1018 |
+
_CG_STATIC_QUALIFIER unsigned int build_mask() {
|
| 1019 |
+
unsigned int mask = fullMask;
|
| 1020 |
+
if (numThreads != 32) {
|
| 1021 |
+
// [0,31] representing the current active thread in the warp
|
| 1022 |
+
unsigned int laneId = details::laneid();
|
| 1023 |
+
// shift mask according to the partition it belongs to
|
| 1024 |
+
mask = th::tileMask << (laneId & ~(th::laneMask));
|
| 1025 |
+
}
|
| 1026 |
+
return (mask);
|
| 1027 |
+
}
|
| 1028 |
+
|
| 1029 |
+
public:
|
| 1030 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
|
| 1031 |
+
|
| 1032 |
+
_CG_STATIC_QUALIFIER void sync() {
|
| 1033 |
+
__syncwarp(build_mask());
|
| 1034 |
+
}
|
| 1035 |
+
|
| 1036 |
+
#ifdef _CG_CPP11_FEATURES
|
| 1037 |
+
// PTX supported collectives
|
| 1038 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 1039 |
+
_CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
|
| 1040 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl(
|
| 1041 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), srcRank, numThreads);
|
| 1042 |
+
}
|
| 1043 |
+
|
| 1044 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 1045 |
+
_CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
|
| 1046 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_down(
|
| 1047 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
|
| 1048 |
+
}
|
| 1049 |
+
|
| 1050 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 1051 |
+
_CG_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int delta) const {
|
| 1052 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_up(
|
| 1053 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
|
| 1054 |
+
}
|
| 1055 |
+
|
| 1056 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 1057 |
+
_CG_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int laneMask) const {
|
| 1058 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_xor(
|
| 1059 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), laneMask, numThreads);
|
| 1060 |
+
}
|
| 1061 |
+
#else
|
| 1062 |
+
template <typename TyIntegral>
|
| 1063 |
+
_CG_QUALIFIER TyIntegral shfl(TyIntegral var, int srcRank) const {
|
| 1064 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1065 |
+
return (__shfl_sync(build_mask(), var, srcRank, numThreads));
|
| 1066 |
+
}
|
| 1067 |
+
|
| 1068 |
+
template <typename TyIntegral>
|
| 1069 |
+
_CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, unsigned int delta) const {
|
| 1070 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1071 |
+
return (__shfl_down_sync(build_mask(), var, delta, numThreads));
|
| 1072 |
+
}
|
| 1073 |
+
|
| 1074 |
+
template <typename TyIntegral>
|
| 1075 |
+
_CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, unsigned int delta) const {
|
| 1076 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1077 |
+
return (__shfl_up_sync(build_mask(), var, delta, numThreads));
|
| 1078 |
+
}
|
| 1079 |
+
|
| 1080 |
+
template <typename TyIntegral>
|
| 1081 |
+
_CG_QUALIFIER TyIntegral shfl_xor(TyIntegral var, unsigned int laneMask) const {
|
| 1082 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1083 |
+
return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
|
| 1084 |
+
}
|
| 1085 |
+
#endif //_CG_CPP11_FEATURES
|
| 1086 |
+
|
| 1087 |
+
_CG_QUALIFIER int any(int predicate) const {
|
| 1088 |
+
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
|
| 1089 |
+
return (lane_ballot != 0);
|
| 1090 |
+
}
|
| 1091 |
+
_CG_QUALIFIER int all(int predicate) const {
|
| 1092 |
+
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
|
| 1093 |
+
return (lane_ballot == build_mask());
|
| 1094 |
+
}
|
| 1095 |
+
_CG_QUALIFIER unsigned int ballot(int predicate) const {
|
| 1096 |
+
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
|
| 1097 |
+
return (lane_ballot >> (details::laneid() & (~(th::laneMask))));
|
| 1098 |
+
}
|
| 1099 |
+
|
| 1100 |
+
#ifdef _CG_HAS_MATCH_COLLECTIVE
|
| 1101 |
+
template <typename TyIntegral>
|
| 1102 |
+
_CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
|
| 1103 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1104 |
+
unsigned int lane_match = __match_any_sync(build_mask(), val);
|
| 1105 |
+
return (lane_match >> (details::laneid() & (~(th::laneMask))));
|
| 1106 |
+
}
|
| 1107 |
+
|
| 1108 |
+
template <typename TyIntegral>
|
| 1109 |
+
_CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
|
| 1110 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1111 |
+
unsigned int lane_match = __match_all_sync(build_mask(), val, &pred);
|
| 1112 |
+
return (lane_match >> (details::laneid() & (~(th::laneMask))));
|
| 1113 |
+
}
|
| 1114 |
+
#endif
|
| 1115 |
+
|
| 1116 |
+
};
|
| 1117 |
+
|
| 1118 |
+
template <unsigned int Size, typename ParentT>
|
| 1119 |
+
class __static_parent_thread_block_tile_base
|
| 1120 |
+
{
|
| 1121 |
+
public:
|
| 1122 |
+
// Rank of this group in the upper level of the hierarchy
|
| 1123 |
+
_CG_STATIC_QUALIFIER unsigned int meta_group_rank() {
|
| 1124 |
+
return ParentT::thread_rank() / Size;
|
| 1125 |
+
}
|
| 1126 |
+
|
| 1127 |
+
// Total num partitions created out of all CTAs when the group was created
|
| 1128 |
+
_CG_STATIC_QUALIFIER unsigned int meta_group_size() {
|
| 1129 |
+
return (ParentT::size() + Size - 1) / Size;
|
| 1130 |
+
}
|
| 1131 |
+
};
|
| 1132 |
+
|
| 1133 |
+
/**
|
| 1134 |
+
* class thread_block_tile<unsigned int Size, ParentT = void>
|
| 1135 |
+
*
|
| 1136 |
+
* Statically-sized group type, representing one tile of a thread block.
|
| 1137 |
+
* The only specializations currently supported are those with native
|
| 1138 |
+
* hardware support (1/2/4/8/16/32)
|
| 1139 |
+
*
|
| 1140 |
+
* This group exposes warp-synchronous builtins.
|
| 1141 |
+
* Can only be constructed via tiled_partition<Size>(ParentT&)
|
| 1142 |
+
*/
|
| 1143 |
+
|
| 1144 |
+
template <unsigned int Size, typename ParentT = void>
|
| 1145 |
+
class __single_warp_thread_block_tile :
|
| 1146 |
+
public __static_size_thread_block_tile_base<Size>,
|
| 1147 |
+
public __static_parent_thread_block_tile_base<Size, ParentT>
|
| 1148 |
+
{
|
| 1149 |
+
typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
|
| 1150 |
+
friend class details::_coalesced_group_data_access;
|
| 1151 |
+
|
| 1152 |
+
protected:
|
| 1153 |
+
_CG_QUALIFIER __single_warp_thread_block_tile() { };
|
| 1154 |
+
_CG_QUALIFIER __single_warp_thread_block_tile(unsigned int, unsigned int) { };
|
| 1155 |
+
|
| 1156 |
+
_CG_STATIC_QUALIFIER unsigned int get_mask() {
|
| 1157 |
+
return __static_size_thread_block_tile_base<Size>::build_mask();
|
| 1158 |
+
}
|
| 1159 |
+
};
|
| 1160 |
+
|
| 1161 |
+
template <unsigned int Size>
|
| 1162 |
+
class __single_warp_thread_block_tile<Size, void> :
|
| 1163 |
+
public __static_size_thread_block_tile_base<Size>,
|
| 1164 |
+
public thread_group_base<details::coalesced_group_id>
|
| 1165 |
+
{
|
| 1166 |
+
_CG_STATIC_CONST_DECL unsigned int numThreads = Size;
|
| 1167 |
+
|
| 1168 |
+
template <unsigned int, typename ParentT> friend class __single_warp_thread_block_tile;
|
| 1169 |
+
friend class details::_coalesced_group_data_access;
|
| 1170 |
+
|
| 1171 |
+
typedef __static_size_thread_block_tile_base<numThreads> staticSizeBaseT;
|
| 1172 |
+
|
| 1173 |
+
protected:
|
| 1174 |
+
_CG_QUALIFIER __single_warp_thread_block_tile(unsigned int meta_group_rank = 0, unsigned int meta_group_size = 1) {
|
| 1175 |
+
_data.coalesced.mask = staticSizeBaseT::build_mask();
|
| 1176 |
+
_data.coalesced.size = numThreads;
|
| 1177 |
+
_data.coalesced.metaGroupRank = meta_group_rank;
|
| 1178 |
+
_data.coalesced.metaGroupSize = meta_group_size;
|
| 1179 |
+
_data.coalesced.is_tiled = true;
|
| 1180 |
+
}
|
| 1181 |
+
|
| 1182 |
+
_CG_QUALIFIER unsigned int get_mask() const {
|
| 1183 |
+
return (_data.coalesced.mask);
|
| 1184 |
+
}
|
| 1185 |
+
|
| 1186 |
+
public:
|
| 1187 |
+
using staticSizeBaseT::sync;
|
| 1188 |
+
using staticSizeBaseT::size;
|
| 1189 |
+
using staticSizeBaseT::num_threads;
|
| 1190 |
+
using staticSizeBaseT::thread_rank;
|
| 1191 |
+
|
| 1192 |
+
_CG_QUALIFIER unsigned int meta_group_rank() const {
|
| 1193 |
+
return _data.coalesced.metaGroupRank;
|
| 1194 |
+
}
|
| 1195 |
+
|
| 1196 |
+
_CG_QUALIFIER unsigned int meta_group_size() const {
|
| 1197 |
+
return _data.coalesced.metaGroupSize;
|
| 1198 |
+
}
|
| 1199 |
+
};
|
| 1200 |
+
|
| 1201 |
+
/**
|
| 1202 |
+
* Outer level API calls
|
| 1203 |
+
* void sync(GroupT) - see <group_type>.sync()
|
| 1204 |
+
* void thread_rank(GroupT) - see <group_type>.thread_rank()
|
| 1205 |
+
* void group_size(GroupT) - see <group_type>.size()
|
| 1206 |
+
*/
|
| 1207 |
+
template <class GroupT>
|
| 1208 |
+
_CG_QUALIFIER void sync(GroupT const &g)
|
| 1209 |
+
{
|
| 1210 |
+
g.sync();
|
| 1211 |
+
}
|
| 1212 |
+
|
| 1213 |
+
// TODO: Use a static dispatch to determine appropriate return type
|
| 1214 |
+
// C++03 is stuck with unsigned long long for now
|
| 1215 |
+
#ifdef _CG_CPP11_FEATURES
|
| 1216 |
+
template <class GroupT>
|
| 1217 |
+
_CG_QUALIFIER auto thread_rank(GroupT const& g) -> decltype(g.thread_rank()) {
|
| 1218 |
+
return g.thread_rank();
|
| 1219 |
+
}
|
| 1220 |
+
|
| 1221 |
+
|
| 1222 |
+
template <class GroupT>
|
| 1223 |
+
_CG_QUALIFIER auto group_size(GroupT const &g) -> decltype(g.num_threads()) {
|
| 1224 |
+
return g.num_threads();
|
| 1225 |
+
}
|
| 1226 |
+
#else
|
| 1227 |
+
template <class GroupT>
|
| 1228 |
+
_CG_QUALIFIER unsigned long long thread_rank(GroupT const& g) {
|
| 1229 |
+
return static_cast<unsigned long long>(g.thread_rank());
|
| 1230 |
+
}
|
| 1231 |
+
|
| 1232 |
+
|
| 1233 |
+
template <class GroupT>
|
| 1234 |
+
_CG_QUALIFIER unsigned long long group_size(GroupT const &g) {
|
| 1235 |
+
return static_cast<unsigned long long>(g.num_threads());
|
| 1236 |
+
}
|
| 1237 |
+
#endif
|
| 1238 |
+
|
| 1239 |
+
|
| 1240 |
+
/**
|
| 1241 |
+
* tiled_partition
|
| 1242 |
+
*
|
| 1243 |
+
* The tiled_partition(parent, tilesz) method is a collective operation that
|
| 1244 |
+
* partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
|
| 1245 |
+
*
|
| 1246 |
+
* A total of ((size(parent)+tilesz-1)/tilesz) subgroups will
|
| 1247 |
+
* be created where threads having identical k = (thread_rank(parent)/tilesz)
|
| 1248 |
+
* will be members of the same subgroup.
|
| 1249 |
+
*
|
| 1250 |
+
* The implementation may cause the calling thread to wait until all the members
|
| 1251 |
+
* of the parent group have invoked the operation before resuming execution.
|
| 1252 |
+
*
|
| 1253 |
+
* Functionality is limited to power-of-two sized subgorup instances of at most
|
| 1254 |
+
* 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be
|
| 1255 |
+
* tiled_partition() in _CG_VERSION 1000.
|
| 1256 |
+
*/
|
| 1257 |
+
_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz)
|
| 1258 |
+
{
|
| 1259 |
+
if (parent.get_type() == details::coalesced_group_id) {
|
| 1260 |
+
const coalesced_group *_cg = static_cast<const coalesced_group*>(&parent);
|
| 1261 |
+
return _cg->_get_tiled_threads(tilesz);
|
| 1262 |
+
}
|
| 1263 |
+
else {
|
| 1264 |
+
const thread_block *_tb = static_cast<const thread_block*>(&parent);
|
| 1265 |
+
return _tb->_get_tiled_threads(tilesz);
|
| 1266 |
+
}
|
| 1267 |
+
}
|
| 1268 |
+
|
| 1269 |
+
// Thread block type overload: returns a basic thread_group for now (may be specialized later)
|
| 1270 |
+
_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz)
|
| 1271 |
+
{
|
| 1272 |
+
return (parent._get_tiled_threads(tilesz));
|
| 1273 |
+
}
|
| 1274 |
+
|
| 1275 |
+
// Coalesced group type overload: retains its ability to stay coalesced
|
| 1276 |
+
_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz)
|
| 1277 |
+
{
|
| 1278 |
+
return (parent._get_tiled_threads(tilesz));
|
| 1279 |
+
}
|
| 1280 |
+
|
| 1281 |
+
namespace details {
    // Warp-sized tile type used internally; avoids the public tiled_partition
    // entry points when the multi-warp machinery needs a 32-thread view.
    template <unsigned int Size, typename ParentT>
    class internal_thread_block_tile : public __single_warp_thread_block_tile<Size, ParentT> {};

    // Constructs an internal tile of the calling thread's warp (or sub-warp).
    template <unsigned int Size, typename ParentT>
    _CG_QUALIFIER internal_thread_block_tile<Size, ParentT> tiled_partition_internal() {
        return internal_thread_block_tile<Size, ParentT>();
    }

    // The three forwarders below exist so that code outside the tile class can
    // reach its private members; they are declared friends of
    // __static_size_multi_warp_tile_base.

    // Runs the tile's two-phase (intra-warp, then inter-warp) collective.
    template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
    _CG_QUALIFIER TyVal multi_warp_collectives_helper(
            const GroupT& group,
            WarpLambda warp_lambda,
            InterWarpLambda inter_warp_lambda) {
        return group.template collectives_scheme<TyVal>(warp_lambda, inter_warp_lambda);
    }

    // Shared-memory scratch slot of warp `warp_id` within `group`.
    template <typename T, typename GroupT>
    _CG_QUALIFIER T* multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id) {
        return group.template get_scratch_location<T>(warp_id);
    }

    // Barrier used to synchronize the member warps of `group`.
    template <typename GroupT>
    _CG_QUALIFIER details::barrier_t* multi_warp_sync_location_getter(const GroupT& group) {
        return group.get_sync_location();
    }

}
|
| 1309 |
+
/**
|
| 1310 |
+
* tiled_partition<tilesz>
|
| 1311 |
+
*
|
| 1312 |
+
* The tiled_partition<tilesz>(parent) method is a collective operation that
|
| 1313 |
+
* partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
|
| 1314 |
+
*
|
| 1315 |
+
* A total of ((size(parent)/tilesz) subgroups will be created,
|
| 1316 |
+
* therefore the parent group size must be evenly divisible by the tilesz.
|
| 1317 |
+
 * The allowed parent groups are thread_block or thread_block_tile<size>.
|
| 1318 |
+
*
|
| 1319 |
+
* The implementation may cause the calling thread to wait until all the members
|
| 1320 |
+
* of the parent group have invoked the operation before resuming execution.
|
| 1321 |
+
*
|
| 1322 |
+
* Functionality is limited to native hardware sizes, 1/2/4/8/16/32.
|
| 1323 |
+
* The size(parent) must be greater than the template Size parameter
|
| 1324 |
+
* otherwise the results are undefined.
|
| 1325 |
+
*/
|
| 1326 |
+
|
| 1327 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 1328 |
+
// Base for tiles wider than one warp (Size in {64, 128, 256, 512}).
// Member warps communicate through per-warp shared-memory scratch slots
// (tile_memory->communication_memory) and synchronize on a barrier that is
// shared by all tiles of the same Size (see get_sync_location).
template <unsigned int Size>
class __static_size_multi_warp_tile_base : public __static_size_tile_base<Size>
{
    static_assert(details::_is_valid_multi_warp_tile<Size>::value, "Size must be one of 64/128/256/512");

    // details:: free functions that need the private scratch/sync accessors.
    template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
    friend __device__ TyVal details::multi_warp_collectives_helper(
            const GroupT& group,
            WarpLambda warp_lambda,
            InterWarpLambda inter_warp_lambda);
    template <typename T, typename GroupT>
    friend __device__ T* details::multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id);
    template <typename GroupT>
    friend __device__ details::barrier_t* details::multi_warp_sync_location_getter(const GroupT& group);
    template <unsigned int OtherSize>
    friend class __static_size_multi_warp_tile_base;

    using WarpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
    using ThisType = __static_size_multi_warp_tile_base<Size>;
    _CG_STATIC_CONST_DECL int numWarps = Size / 32;

protected:
    // Shared-memory communication area, taken over from the parent group.
    details::multi_warp_scratch* const tile_memory;

    template <typename GroupT>
    _CG_QUALIFIER __static_size_multi_warp_tile_base(const GroupT& g) : tile_memory(g.tile_memory) {
#if defined(_CG_HAS_RESERVED_SHARED)
        // Reset this tile's barrier before first use; the parent-wide sync
        // keeps any member warp from touching it before the reset lands.
        details::sync_warps_reset(get_sync_location(), details::cta::thread_rank());
        g.sync();
#endif
    }


private:
    _CG_QUALIFIER details::barrier_t* get_sync_location() const {
        // Different group sizes use different barriers, all groups of a given size share one barrier.
        unsigned int sync_id = details::log2(Size / 64);
        return &tile_memory->barriers[sync_id];
    }

    // Scratch slot of warp `warp_id` within this tile. The CTA-wide slot index
    // is recovered by subtracting the tile-local rank from the CTA rank.
    template <typename T>
    _CG_QUALIFIER T* get_scratch_location(unsigned int warp_id) const {
        unsigned int scratch_id = (details::cta::thread_rank() - thread_rank()) / 32 + warp_id;
        return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
    }

    // Scratch slot of the calling thread's own warp.
    template <typename T>
    _CG_QUALIFIER T* get_scratch_location() const {
        unsigned int scratch_id = details::cta::thread_rank() / 32;
        return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
    }

    // Broadcast `val` held by tile-rank `src` to every thread in the tile.
    template <typename TyVal>
    _CG_QUALIFIER TyVal shfl_impl(TyVal val, unsigned int src) const {
        unsigned int src_warp = src / 32;
        auto warp = details::tiled_partition_internal<32, ThisType>();
        details::barrier_t* sync_location = get_sync_location();

        // Get warp slot of the source threads warp.
        TyVal* warp_scratch_location = get_scratch_location<TyVal>(src_warp);

        if (warp.meta_group_rank() == src_warp) {
            warp.sync();
            // Put shuffled value into my warp slot and let my warp arrive at the barrier.
            if (thread_rank() == src) {
                *warp_scratch_location = val;
            }
            details::sync_warps_arrive(sync_location, details::cta::thread_rank(), numWarps);
            TyVal result = *warp_scratch_location;
            details::sync_warps_wait(sync_location, details::cta::thread_rank());
            return result;
        }
        else {
            // Wait for the source warp to arrive on the barrier.
            details::sync_warps_wait_for_specific_warp(sync_location,
                    (details::cta::thread_rank() / 32 - warp.meta_group_rank() + src_warp));
            TyVal result = *warp_scratch_location;
            details::sync_warps(sync_location, details::cta::thread_rank(), numWarps);
            return result;
        }
    }

    // Two-phase collective: warp_lambda combines values within each warp into
    // that warp's scratch slot; the last-arriving warp then combines the
    // per-warp partials with inter_warp_lambda and releases the others.
    template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
    _CG_QUALIFIER TyVal collectives_scheme(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const {
        static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
                      "Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
        auto warp = details::tiled_partition_internal<32, ThisType>();
        details::barrier_t* sync_location = get_sync_location();
        TyVal* warp_scratch_location = get_scratch_location<TyVal>();

        warp_lambda(warp, warp_scratch_location);

        if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), numWarps)) {
            // One sub-group of numWarps threads reduces across the per-warp slots.
            auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();
            if (subwarp.meta_group_rank() == 0) {
                TyVal* thread_scratch_location = get_scratch_location<TyVal>(subwarp.thread_rank());
                inter_warp_lambda(subwarp, thread_scratch_location);
            }
            warp.sync();
            details::sync_warps_release(sync_location, warp.thread_rank() == 0, details::cta::thread_rank(), numWarps);
        }
        TyVal result = *warp_scratch_location;
        return result;
    }

public:
    _CG_STATIC_CONST_DECL unsigned int _group_id = details::multi_tile_group_id;

    using __static_size_tile_base<Size>::thread_rank;

    // Broadcast val from tile-rank src to the whole tile (collective call).
    template <typename TyVal>
    _CG_QUALIFIER TyVal shfl(TyVal val, unsigned int src) const {
        static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
                      "Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
        return shfl_impl(val, src);
    }

    // Barrier across all member warps of this tile.
    _CG_QUALIFIER void sync() const {
        details::sync_warps(get_sync_location(), details::cta::thread_rank(), numWarps);
    }

    // Non-zero if predicate is non-zero on any thread of the tile (collective).
    _CG_QUALIFIER int any(int predicate) const {
        auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
            *warp_scratch_location = __any_sync(0xFFFFFFFF, predicate);
        };
        auto inter_warp_lambda =
            [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
                *thread_scratch_location = __any_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
            };
        return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
    }

    // Non-zero only if predicate is non-zero on every thread of the tile (collective).
    _CG_QUALIFIER int all(int predicate) const {
        auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
            *warp_scratch_location = __all_sync(0xFFFFFFFF, predicate);
        };
        auto inter_warp_lambda =
            [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
                *thread_scratch_location = __all_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
            };
        return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
    }
};
|
| 1470 |
+
|
| 1471 |
+
|
| 1472 |
+
// Multi-warp tile with a statically known parent: combines the shared-memory
// tile machinery with the static meta-group bookkeeping for ParentT.
template <unsigned int Size, typename ParentT = void>
class __multi_warp_thread_block_tile :
    public __static_size_multi_warp_tile_base<Size>,
    public __static_parent_thread_block_tile_base<Size, ParentT>
{
    typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
    typedef __static_size_multi_warp_tile_base<Size> staticTileBaseT;
protected:
    _CG_QUALIFIER __multi_warp_thread_block_tile(const ParentT& g) :
        __static_size_multi_warp_tile_base<Size>(g) {}
};
|
| 1483 |
+
|
| 1484 |
+
// Parent-erased multi-warp tile: meta-group rank/size are captured at
// construction (from the statically typed source tile) instead of being
// derived from a parent type.
template <unsigned int Size>
class __multi_warp_thread_block_tile<Size, void> : public __static_size_multi_warp_tile_base<Size>
{
    const unsigned int metaGroupRank;
    const unsigned int metaGroupSize;

protected:
    template <unsigned int OtherSize, typename ParentT>
    _CG_QUALIFIER __multi_warp_thread_block_tile(const __multi_warp_thread_block_tile<OtherSize, ParentT>& g) :
        __static_size_multi_warp_tile_base<Size>(g), metaGroupRank(g.meta_group_rank()), metaGroupSize(g.meta_group_size()) {}

public:
    // Rank of this tile among its sibling tiles in the parent partition.
    _CG_QUALIFIER unsigned int meta_group_rank() const {
        return metaGroupRank;
    }

    // Number of tiles the parent was partitioned into.
    _CG_QUALIFIER unsigned int meta_group_size() const {
        return metaGroupSize;
    }
};
|
| 1504 |
+
#endif
|
| 1505 |
+
|
| 1506 |
+
template <unsigned int Size, typename ParentT = void>
|
| 1507 |
+
class thread_block_tile;
|
| 1508 |
+
|
| 1509 |
+
namespace details {
// Implementation selector for thread_block_tile: IsMultiWarp picks between the
// single-warp (<=32 threads) and multi-warp (>32 threads) backends.
template <unsigned int Size, typename ParentT, bool IsMultiWarp>
class thread_block_tile_impl;

// Single-warp backend: no shared-memory machinery needed.
template <unsigned int Size, typename ParentT>
class thread_block_tile_impl<Size, ParentT, false>: public __single_warp_thread_block_tile<Size, ParentT>
{
    protected:
    template <unsigned int OtherSize, typename OtherParentT, bool OtherIsMultiWarp>
    _CG_QUALIFIER thread_block_tile_impl(const thread_block_tile_impl<OtherSize, OtherParentT, OtherIsMultiWarp>& g) :
        __single_warp_thread_block_tile<Size, ParentT>(g.meta_group_rank(), g.meta_group_size()) {}

    _CG_QUALIFIER thread_block_tile_impl(const thread_block& g) :
        __single_warp_thread_block_tile<Size, ParentT>() {}
};

#if defined(_CG_CPP11_FEATURES)
// Multi-warp backend (requires C++11 features).
template <unsigned int Size, typename ParentT>
class thread_block_tile_impl<Size, ParentT, true> : public __multi_warp_thread_block_tile<Size, ParentT>
{
    protected:
    template <typename GroupT>
    _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) :
        __multi_warp_thread_block_tile<Size, ParentT>(g) {}
};
#else
// Without C++11 the multi-warp backend is an empty stub; large tiles are
// not usable in this configuration.
template <unsigned int Size, typename ParentT>
class thread_block_tile_impl<Size, ParentT, true>
{
    protected:
    template <typename GroupT>
    _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) {}
};
#endif
}
|
| 1544 |
+
|
| 1545 |
+
// Public statically sized tile with a known parent type. Constructible only by
// the partitioning machinery (constructor is protected); implicitly converts
// to the parent-erased thread_block_tile<Size, void>.
template <unsigned int Size, typename ParentT>
class thread_block_tile : public details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>
{
    friend _CG_QUALIFIER thread_block_tile<1, void> this_thread();

protected:
    _CG_QUALIFIER thread_block_tile(const ParentT& g) :
        details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>(g) {}

public:
    // Erase the parent type; meta-group information is preserved.
    _CG_QUALIFIER operator thread_block_tile<Size, void>() const {
        return thread_block_tile<Size, void>(*this);
    }
};
|
| 1559 |
+
|
| 1560 |
+
// Parent-erased tile. Copy-constructible from any statically typed tile of the
// same Size, which is what the implicit conversion above relies on.
template <unsigned int Size>
class thread_block_tile<Size, void> : public details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>
{
    template <unsigned int, typename ParentT>
    friend class thread_block_tile;

protected:
    template <unsigned int OtherSize, typename OtherParentT>
    _CG_QUALIFIER thread_block_tile(const thread_block_tile<OtherSize, OtherParentT>& g) :
        details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}

public:
    template <typename ParentT>
    _CG_QUALIFIER thread_block_tile(const thread_block_tile<Size, ParentT>& g) :
        details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
};
|
| 1576 |
+
|
| 1577 |
+
namespace details {
// Helper that gives tiled_partition<Size>(parent) access to the protected
// thread_block_tile constructors, specialized per allowed parent type.
template <unsigned int Size, typename ParentT>
struct tiled_partition_impl;

// ParentT = thread_block specialization.
template <unsigned int Size>
struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size, thread_block> {
    _CG_QUALIFIER tiled_partition_impl(const thread_block& g) :
        thread_block_tile<Size, thread_block>(g) {}
};

// ParentT = static thread_block_tile<ParentSize, GrandParent> specialization
template <unsigned int Size, unsigned int ParentSize, typename GrandParent>
struct tiled_partition_impl<Size, thread_block_tile<ParentSize, GrandParent> > :
    public thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> > {
#ifdef _CG_CPP11_FEATURES
    // A tile must be strictly smaller than the group it partitions.
    static_assert(Size < ParentSize, "Tile size bigger or equal to the parent group size");
#endif
    _CG_QUALIFIER tiled_partition_impl(const thread_block_tile<ParentSize, GrandParent>& g) :
        thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> >(g) {}
};

}
|
| 1599 |
+
|
| 1600 |
+
// Statically sized tiled_partition entry point: partitions `g` into tiles of
// Size threads and returns the tile containing the calling thread.
template <unsigned int Size, typename ParentT>
_CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
{
    return details::tiled_partition_impl<Size, ParentT>(g);
}
|
| 1605 |
+
|
| 1606 |
+
/**
 * this_thread()
 *
 * Constructs a thread_block_tile<1, void> containing only the calling thread
 * (usable anywhere a generic single-thread group is needed).
 */
_CG_QUALIFIER thread_block_tile<1, void> this_thread()
{
    // Make thread_block_tile<1, thread_block> parent of the returned group, so it will have its
    // meta group rank and size set to 0 and 1 respectively.
    return thread_block_tile<1, thread_block_tile<1, thread_block> >(this_thread_block());
}
|
| 1617 |
+
|
| 1618 |
+
/**
 * <group_type>.sync()
 *
 * Executes a barrier across the group.
 *
 * Implements both a compiler fence and an architectural fence to prevent
 * memory reordering around the barrier. Dispatches on the runtime group id
 * stored in _data.group.type; unknown ids are a no-op.
 */
_CG_QUALIFIER void thread_group::sync() const
{
    switch (_data.group.type) {
    case details::coalesced_group_id:
        cooperative_groups::sync(*static_cast<const coalesced_group*>(this));
        break;
    case details::thread_block_id:
        cooperative_groups::sync(*static_cast<const thread_block*>(this));
        break;
    case details::grid_group_id:
        cooperative_groups::sync(*static_cast<const grid_group*>(this));
        break;
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
    case details::multi_grid_group_id:
        cooperative_groups::sync(*static_cast<const multi_grid_group*>(this));
        break;
#endif
#if defined(_CG_HAS_CLUSTER_GROUP)
    case details::cluster_group_id:
        cooperative_groups::sync(*static_cast<const cluster_group*>(this));
        break;
#endif
    default:
        break;
    }
}
|
| 1652 |
+
|
| 1653 |
+
/**
 * <group_type>.size()
 *
 * Returns the total number of threads in the group, dispatching on the
 * runtime group id. Unknown group ids yield 0.
 */
_CG_QUALIFIER unsigned long long thread_group::size() const
{
    unsigned long long size = 0;
    switch (_data.group.type) {
    case details::coalesced_group_id:
        size = cooperative_groups::group_size(*static_cast<const coalesced_group*>(this));
        break;
    case details::thread_block_id:
        size = cooperative_groups::group_size(*static_cast<const thread_block*>(this));
        break;
    case details::grid_group_id:
        size = cooperative_groups::group_size(*static_cast<const grid_group*>(this));
        break;
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
    case details::multi_grid_group_id:
        size = cooperative_groups::group_size(*static_cast<const multi_grid_group*>(this));
        break;
#endif
#if defined(_CG_HAS_CLUSTER_GROUP)
    case details::cluster_group_id:
        size = cooperative_groups::group_size(*static_cast<const cluster_group*>(this));
        break;
#endif
    default:
        break;
    }
    return size;
}
|
| 1686 |
+
|
| 1687 |
+
/**
 * <group_type>.thread_rank()
 *
 * Returns the linearized rank of the calling thread along the interval
 * [0, size()), dispatching on the runtime group id. Unknown ids yield 0.
 */
_CG_QUALIFIER unsigned long long thread_group::thread_rank() const
{
    unsigned long long rank = 0;
    switch (_data.group.type) {
    case details::coalesced_group_id:
        rank = cooperative_groups::thread_rank(*static_cast<const coalesced_group*>(this));
        break;
    case details::thread_block_id:
        rank = cooperative_groups::thread_rank(*static_cast<const thread_block*>(this));
        break;
    case details::grid_group_id:
        rank = cooperative_groups::thread_rank(*static_cast<const grid_group*>(this));
        break;
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
    case details::multi_grid_group_id:
        rank = cooperative_groups::thread_rank(*static_cast<const multi_grid_group*>(this));
        break;
#endif
#if defined(_CG_HAS_CLUSTER_GROUP)
    case details::cluster_group_id:
        rank = cooperative_groups::thread_rank(*static_cast<const cluster_group*>(this));
        break;
#endif
    default:
        break;
    }
    return rank;
}
|
| 1720 |
+
|
| 1721 |
+
_CG_END_NAMESPACE
|
| 1722 |
+
|
| 1723 |
+
#include <cooperative_groups/details/partitioning.h>
|
| 1724 |
+
#if (!defined(_MSC_VER) || defined(_WIN64))
|
| 1725 |
+
# include <cooperative_groups/details/invoke.h>
|
| 1726 |
+
#endif
|
| 1727 |
+
|
| 1728 |
+
# endif /* ! (__cplusplus, __CUDACC__) */
|
| 1729 |
+
|
| 1730 |
+
#endif /* !_COOPERATIVE_GROUPS_H_ */
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/async.h
ADDED
|
@@ -0,0 +1,452 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_ASYNC_H
|
| 50 |
+
#define _CG_ASYNC_H
|
| 51 |
+
|
| 52 |
+
#include "helpers.h"
|
| 53 |
+
#include "info.h"
|
| 54 |
+
|
| 55 |
+
#include <cuda_pipeline.h>
|
| 56 |
+
|
| 57 |
+
_CG_BEGIN_NAMESPACE
|
| 58 |
+
|
| 59 |
+
namespace details {
|
| 60 |
+
// Groups supported by memcpy_async: thread_block_tile (any size/parent),
// coalesced_group, and thread_block. Everything else is false_type.
template <class TyGroup>
struct _async_copy_group_supported : public _CG_STL_NAMESPACE::false_type {};

template <unsigned int Sz, typename TyPar>
struct _async_copy_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>>
    : public _CG_STL_NAMESPACE::true_type {};
template <>
struct _async_copy_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
template <>
struct _async_copy_group_supported<cooperative_groups::thread_block> : public _CG_STL_NAMESPACE::true_type {};

// cv/ref-stripped convenience alias.
template <class TyGroup>
using async_copy_group_supported = _async_copy_group_supported<details::remove_qual<TyGroup>>;
|
| 74 |
+
|
| 75 |
+
// Groups that take the tile-optimized copy path: thread_block_tile of size > 1
// (the explicit size-1 specialization opts out; all other groups are
// false_type by default).
template <class TyGroup>
struct _async_copy_optimize_tile : public _CG_STL_NAMESPACE::false_type {};

template <typename TyPar>
struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<1, TyPar>>
    : public _CG_STL_NAMESPACE::false_type {};

template <unsigned int Sz, typename TyPar>
struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<Sz, TyPar>>
    : public _CG_STL_NAMESPACE::true_type {};

// cv/ref-stripped convenience alias.
template <class TyGroup>
using async_copy_optimize_tile = _async_copy_optimize_tile<details::remove_qual<TyGroup>>;
|
| 89 |
+
|
| 90 |
+
// SFINAE switches used to select between the two accelerated_async_copy
// overloads below (void* so they can serve as defaulted non-type parameters).
template <class TyGroup>
using enable_tile_optimization =
    typename _CG_STL_NAMESPACE::enable_if<async_copy_optimize_tile<TyGroup>::value, void *>::type;

template <class TyGroup>
using disable_tile_optimization =
    typename _CG_STL_NAMESPACE::enable_if<!async_copy_optimize_tile<TyGroup>::value, void *>::type;
|
| 98 |
+
|
| 99 |
+
// Segment for punning to aligned types: N ints with an alignment equal to the
// segment's byte size (4/8/16), so copies can be issued at that width.
template <unsigned int N>
struct _Segment {
    int _seg[N];
};

// Trivial layout guaranteed-aligned copy-async compatible segments
template <unsigned int N>
struct Segment;
template <>
struct __align__(4) Segment<1> : public _Segment<1>{};
template <>
struct __align__(8) Segment<2> : public _Segment<2>{};
template <>
struct __align__(16) Segment<4> : public _Segment<4>{};
|
| 114 |
+
|
| 115 |
+
// Interleaved element-by-element copy fallback: each thread of `group` copies
// every group.size()-th element starting at its rank. Used whenever the
// cp.async fast path is unavailable.
template <typename TyGroup, typename TyElem>
_CG_STATIC_QUALIFIER void inline_copy(TyGroup &group, TyElem *__restrict__ dst, const TyElem *__restrict__ src,
                                      size_t count) {
    const unsigned int rank = group.thread_rank();
    const unsigned int stride = group.size();

    for (size_t idx = rank; idx < count; idx += stride) {
        dst[idx] = src[idx];
    }
}
|
| 126 |
+
|
| 127 |
+
// Tile-optimized async copy: issues __pipeline_memcpy_async in uniform
// stride-sized batches, with the sub-stride remainder handled in one separate
// tail step so all loop iterations do the same amount of work.
// Falls back to inline_copy unless dst is __shared__ and src is global.
template <typename TyGroup, typename TyElem, enable_tile_optimization<TyGroup> = nullptr>
_CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
                                                 const TyElem *__restrict__ src, size_t count) {
    static_assert(async_copy_group_supported<TyGroup>::value,
                  "Async copy is only supported for groups that represent private shared memory");

    if (count == 0) {
        return;
    }

    const bool dstIsNotShared = !__isShared(dst);
    const bool srcIsNotGlobal = !__isGlobal(src);

    if (dstIsNotShared || srcIsNotGlobal) {
        inline_copy(group, dst, src, count);
        return;
    }

    const unsigned int stride = group.size();
    const unsigned int rank = group.thread_rank();
    // Efficient copies require warps to operate on the same amount of work at each step.
    // remainders are handled in a separate stage to prevent branching
    const unsigned int subWarpMask = (stride - 1);
    const unsigned int subwarpCopies = (subWarpMask & (unsigned int)count);
    const unsigned int maxSubwarpRank = min(rank, subwarpCopies - 1);

    const size_t warpCopies = (count & (~subWarpMask));

    for (size_t idx = 0; idx < warpCopies; idx += stride) {
        size_t _srcIdx = rank + idx;
        size_t _dstIdx = rank + idx;
        __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
    }

    if (subwarpCopies) {
        // Ranks beyond the remainder clamp to the last remainder element; the
        // duplicated copies write identical bytes to the same destination.
        size_t _srcIdx = warpCopies + maxSubwarpRank;
        size_t _dstIdx = warpCopies + maxSubwarpRank;
        __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
    }
}
|
| 167 |
+
|
| 168 |
+
template <typename TyGroup, typename TyElem, disable_tile_optimization<TyGroup> = nullptr>
|
| 169 |
+
_CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
|
| 170 |
+
const TyElem *__restrict__ src, size_t count) {
|
| 171 |
+
static_assert(async_copy_group_supported<TyGroup>::value,
|
| 172 |
+
"Async copy is only supported for groups that represent private shared memory");
|
| 173 |
+
|
| 174 |
+
const bool dstIsNotShared = !__isShared(dst);
|
| 175 |
+
const bool srcIsNotGlobal = !__isGlobal(src);
|
| 176 |
+
|
| 177 |
+
if (dstIsNotShared || srcIsNotGlobal) {
|
| 178 |
+
inline_copy(group, dst, src, count);
|
| 179 |
+
return;
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
unsigned int stride = group.size();
|
| 183 |
+
unsigned int rank = group.thread_rank();
|
| 184 |
+
|
| 185 |
+
for (size_t idx = rank; idx < count; idx += stride) {
|
| 186 |
+
size_t _srcIdx = idx;
|
| 187 |
+
size_t _dstIdx = idx;
|
| 188 |
+
__pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
|
| 189 |
+
}
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
// Determine best possible alignment given an input and initial conditions
|
| 193 |
+
// Attempts to generate as little code as possible, most likely should only be used with 1 and 2 byte alignments
|
| 194 |
+
template <unsigned int MinAlignment, unsigned int MaxAlignment>
|
| 195 |
+
_CG_STATIC_QUALIFIER uint32_t find_best_alignment(void *__restrict__ dst, const void *__restrict__ src) {
|
| 196 |
+
// Narrowing conversion intentional
|
| 197 |
+
uint32_t base1 = (uint32_t) reinterpret_cast<uintptr_t>(src);
|
| 198 |
+
uint32_t base2 = (uint32_t) reinterpret_cast<uintptr_t>(dst);
|
| 199 |
+
|
| 200 |
+
uint32_t diff = ((base1) ^ (base2)) & (MaxAlignment - 1);
|
| 201 |
+
|
| 202 |
+
// range [MaxAlignment, alignof(elem)], step: x >> 1
|
| 203 |
+
// over range of possible alignments, choose best available out of range
|
| 204 |
+
uint32_t out = MaxAlignment;
|
| 205 |
+
#pragma unroll
|
| 206 |
+
for (uint32_t alignment = (MaxAlignment >> 1); alignment >= MinAlignment; alignment >>= 1) {
|
| 207 |
+
if (alignment & diff)
|
| 208 |
+
out = alignment;
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
return out;
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
// Determine best possible alignment given an input and initial conditions
|
| 215 |
+
// Attempts to generate as little code as possible, most likely should only be used with 1 and 2 byte alignments
|
| 216 |
+
template <typename TyType, typename TyGroup>
|
| 217 |
+
_CG_STATIC_QUALIFIER void copy_like(const TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
|
| 218 |
+
size_t count) {
|
| 219 |
+
const char *src = reinterpret_cast<const char *>(_src);
|
| 220 |
+
char *dst = reinterpret_cast<char *>(_dst);
|
| 221 |
+
|
| 222 |
+
constexpr uint32_t targetAlignment = (uint32_t)alignof(TyType);
|
| 223 |
+
|
| 224 |
+
uint32_t base = (uint32_t) reinterpret_cast<uintptr_t>(src);
|
| 225 |
+
uint32_t alignOffset = ((~base) + 1) & (targetAlignment - 1);
|
| 226 |
+
|
| 227 |
+
inline_copy(group, dst, src, alignOffset);
|
| 228 |
+
count -= alignOffset;
|
| 229 |
+
src += alignOffset;
|
| 230 |
+
dst += alignOffset;
|
| 231 |
+
|
| 232 |
+
// Copy using the best available alignment, async_copy expects n-datums, not bytes
|
| 233 |
+
size_t asyncCount = count / sizeof(TyType);
|
| 234 |
+
accelerated_async_copy(group, reinterpret_cast<TyType *>(dst), reinterpret_cast<const TyType *>(src), asyncCount);
|
| 235 |
+
asyncCount *= sizeof(TyType);
|
| 236 |
+
|
| 237 |
+
count -= asyncCount;
|
| 238 |
+
src += asyncCount;
|
| 239 |
+
dst += asyncCount;
|
| 240 |
+
inline_copy(group, dst, src, count);
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
// We must determine alignment and manually align src/dst ourselves
|
| 244 |
+
template <size_t AlignHint>
|
| 245 |
+
struct _memcpy_async_align_dispatch {
|
| 246 |
+
template <typename TyGroup>
|
| 247 |
+
_CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ dst, const void *__restrict__ src, size_t count) {
|
| 248 |
+
uint32_t alignment = find_best_alignment<AlignHint, 16>(dst, src);
|
| 249 |
+
|
| 250 |
+
// Avoid copying the extra bytes if desired copy count is smaller
|
| 251 |
+
alignment = count < alignment ? AlignHint : alignment;
|
| 252 |
+
|
| 253 |
+
switch (alignment) {
|
| 254 |
+
default:
|
| 255 |
+
case 1:
|
| 256 |
+
inline_copy(group, reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src), count);
|
| 257 |
+
break;
|
| 258 |
+
case 2:
|
| 259 |
+
inline_copy(group, reinterpret_cast<short *>(dst), reinterpret_cast<const short *>(src), count >> 1);
|
| 260 |
+
break;
|
| 261 |
+
case 4:
|
| 262 |
+
copy_like<Segment<1>>(group, dst, src, count);
|
| 263 |
+
break;
|
| 264 |
+
case 8:
|
| 265 |
+
copy_like<Segment<2>>(group, dst, src, count);
|
| 266 |
+
break;
|
| 267 |
+
case 16:
|
| 268 |
+
copy_like<Segment<4>>(group, dst, src, count);
|
| 269 |
+
break;
|
| 270 |
+
}
|
| 271 |
+
}
|
| 272 |
+
};
|
| 273 |
+
|
| 274 |
+
// Specialization for 4 byte alignments
|
| 275 |
+
template <>
|
| 276 |
+
struct _memcpy_async_align_dispatch<4> {
|
| 277 |
+
template <typename TyGroup>
|
| 278 |
+
_CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
|
| 279 |
+
size_t count) {
|
| 280 |
+
const Segment<1> *src = reinterpret_cast<const Segment<1> *>(_src);
|
| 281 |
+
Segment<1> *dst = reinterpret_cast<Segment<1> *>(_dst);
|
| 282 |
+
|
| 283 |
+
// Dispatch straight to aligned LDGSTS calls
|
| 284 |
+
accelerated_async_copy(group, dst, src, count / sizeof(*dst));
|
| 285 |
+
}
|
| 286 |
+
};
|
| 287 |
+
|
| 288 |
+
// Specialization for 8 byte alignments
|
| 289 |
+
template <>
|
| 290 |
+
struct _memcpy_async_align_dispatch<8> {
|
| 291 |
+
template <typename TyGroup>
|
| 292 |
+
_CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
|
| 293 |
+
size_t count) {
|
| 294 |
+
const Segment<2> *src = reinterpret_cast<const Segment<2> *>(_src);
|
| 295 |
+
Segment<2> *dst = reinterpret_cast<Segment<2> *>(_dst);
|
| 296 |
+
|
| 297 |
+
// Dispatch straight to aligned LDGSTS calls
|
| 298 |
+
accelerated_async_copy(group, dst, src, count / sizeof(*dst));
|
| 299 |
+
}
|
| 300 |
+
};
|
| 301 |
+
|
| 302 |
+
// Alignments over 16 are truncated to 16 and bypass alignment
|
| 303 |
+
// This is the highest performing memcpy available
|
| 304 |
+
template <>
|
| 305 |
+
struct _memcpy_async_align_dispatch<16> {
|
| 306 |
+
template <typename TyGroup>
|
| 307 |
+
_CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
|
| 308 |
+
size_t count) {
|
| 309 |
+
const Segment<4> *src = reinterpret_cast<const Segment<4> *>(_src);
|
| 310 |
+
Segment<4> *dst = reinterpret_cast<Segment<4> *>(_dst);
|
| 311 |
+
|
| 312 |
+
// Dispatch straight to aligned LDGSTS calls
|
| 313 |
+
accelerated_async_copy(group, dst, src, count / sizeof(*dst));
|
| 314 |
+
}
|
| 315 |
+
};
|
| 316 |
+
|
| 317 |
+
// byte-wide API
|
| 318 |
+
template <size_t Alignment, class TyGroup>
|
| 319 |
+
_CG_STATIC_QUALIFIER void _memcpy_async_dispatch_to_aligned_copy(const TyGroup &group, void *__restrict__ _dst,
|
| 320 |
+
const void *__restrict__ _src, size_t count) {
|
| 321 |
+
static_assert(!(Alignment & (Alignment - 1)), "Known static alignment dispatch must be a power of 2");
|
| 322 |
+
details::_memcpy_async_align_dispatch<Alignment>::copy(group, _dst, _src, count);
|
| 323 |
+
}
|
| 324 |
+
|
| 325 |
+
// Internal dispatch APIs
|
| 326 |
+
// These deduce the alignments and sizes necessary to invoke the underlying copy engine
|
| 327 |
+
template <typename Ty>
|
| 328 |
+
using is_void = _CG_STL_NAMESPACE::is_same<Ty, void>;
|
| 329 |
+
|
| 330 |
+
template <typename Ty>
|
| 331 |
+
using enable_if_not_void = typename _CG_STL_NAMESPACE::enable_if<!is_void<Ty>::value, void *>::type;
|
| 332 |
+
|
| 333 |
+
template <typename Ty>
|
| 334 |
+
using enable_if_void = typename _CG_STL_NAMESPACE::enable_if<is_void<Ty>::value, void *>::type;
|
| 335 |
+
|
| 336 |
+
template <typename Ty>
|
| 337 |
+
using enable_if_integral =
|
| 338 |
+
typename _CG_STL_NAMESPACE::enable_if<_CG_STL_NAMESPACE::is_integral<Ty>::value, void *>::type;
|
| 339 |
+
|
| 340 |
+
// byte-wide API using aligned_sized_t
|
| 341 |
+
template <class TyGroup, template <size_t> typename Alignment, size_t Hint>
|
| 342 |
+
_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, void *__restrict__ _dst,
|
| 343 |
+
const void *__restrict__ _src, const Alignment<Hint> &count) {
|
| 344 |
+
constexpr size_t _align = (Hint > 16) ? 16 : Hint;
|
| 345 |
+
|
| 346 |
+
details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, (size_t)count);
|
| 347 |
+
}
|
| 348 |
+
|
| 349 |
+
// byte-wide API using type for aligment
|
| 350 |
+
template <class TyGroup, typename TyElem, typename TySize, size_t Hint = alignof(TyElem),
|
| 351 |
+
enable_if_not_void<TyElem> = nullptr, enable_if_integral<TySize> = nullptr>
|
| 352 |
+
_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
|
| 353 |
+
const TyElem *__restrict__ _src, const TySize& count) {
|
| 354 |
+
constexpr size_t _align = (Hint > 16) ? 16 : Hint;
|
| 355 |
+
|
| 356 |
+
details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, count);
|
| 357 |
+
}
|
| 358 |
+
|
| 359 |
+
// byte-wide API with full alignment deduction required
|
| 360 |
+
template <class TyGroup, typename TyElem, typename TySize, enable_if_void<TyElem> = nullptr,
|
| 361 |
+
enable_if_integral<TySize> = nullptr>
|
| 362 |
+
_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
|
| 363 |
+
const TyElem *__restrict__ _src, const TySize& count) {
|
| 364 |
+
details::_memcpy_async_dispatch_to_aligned_copy<1>(group, _dst, _src, count);
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
// 1d-datum API
|
| 368 |
+
template <class TyGroup, typename TyElem, size_t Hint = alignof(TyElem)>
|
| 369 |
+
_CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const size_t dstCount,
|
| 370 |
+
const TyElem *__restrict__ src, const size_t srcCount) {
|
| 371 |
+
constexpr unsigned int _align = Hint;
|
| 372 |
+
const size_t totalCount = min(dstCount, srcCount) * sizeof(TyElem);
|
| 373 |
+
|
| 374 |
+
details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
|
| 375 |
+
}
|
| 376 |
+
|
| 377 |
+
// 1d-datum API using aligned_size_t
|
| 378 |
+
template <class TyGroup, typename TyElem, template <size_t> typename Alignment, size_t Hint>
|
| 379 |
+
_CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const Alignment<Hint> &dstCount,
|
| 380 |
+
const TyElem *__restrict__ src, const Alignment<Hint> &srcCount) {
|
| 381 |
+
constexpr unsigned int _align = Hint;
|
| 382 |
+
const size_t totalCount = min((size_t)dstCount, (size_t)srcCount) * sizeof(TyElem);
|
| 383 |
+
|
| 384 |
+
details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
} // namespace details
|
| 388 |
+
|
| 389 |
+
/*
|
| 390 |
+
* Group submit batch of async-copy to cover contiguous 1D array
|
| 391 |
+
* and commit that batch to eventually wait for completion.
|
| 392 |
+
*/
|
| 393 |
+
template <class TyGroup, typename TyElem, typename TySizeT>
|
| 394 |
+
_CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ _dst, const TyElem *__restrict__ _src,
|
| 395 |
+
const TySizeT &count) {
|
| 396 |
+
details::_memcpy_async_bytes(group, _dst, _src, count);
|
| 397 |
+
__pipeline_commit();
|
| 398 |
+
}
|
| 399 |
+
|
| 400 |
+
/*
|
| 401 |
+
* Group submit batch of async-copy to cover contiguous 1D array
|
| 402 |
+
* and commit that batch to eventually wait for completion.
|
| 403 |
+
* Object counts are in datum sized chunks, not bytes.
|
| 404 |
+
*/
|
| 405 |
+
template <class TyGroup, class TyElem, typename DstLayout, typename SrcLayout>
|
| 406 |
+
_CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ dst, const DstLayout &dstLayout,
|
| 407 |
+
const TyElem *__restrict__ src, const SrcLayout &srcLayout) {
|
| 408 |
+
details::_memcpy_async_datum(group, dst, dstLayout, src, srcLayout);
|
| 409 |
+
__pipeline_commit();
|
| 410 |
+
}
|
| 411 |
+
|
| 412 |
+
/* Group wait for prior Nth stage of memcpy_async to complete. */
|
| 413 |
+
template <unsigned int Stage, class TyGroup>
|
| 414 |
+
_CG_STATIC_QUALIFIER void wait_prior(const TyGroup &group) {
|
| 415 |
+
__pipeline_wait_prior(Stage);
|
| 416 |
+
group.sync();
|
| 417 |
+
}
|
| 418 |
+
|
| 419 |
+
/* Group wait all previously submitted memcpy_async to complete. */
|
| 420 |
+
template <class TyGroup>
|
| 421 |
+
_CG_STATIC_QUALIFIER void wait(const TyGroup &group) {
|
| 422 |
+
__pipeline_wait_prior(0);
|
| 423 |
+
group.sync();
|
| 424 |
+
}
|
| 425 |
+
|
| 426 |
+
/***************** CG APIs including pipeline are deprecated *****************/
|
| 427 |
+
|
| 428 |
+
/* Group submit batch of async-copy to cover of contiguous 1D array
|
| 429 |
+
to a pipeline and commit the batch*/
|
| 430 |
+
template <class TyGroup, class TyElem>
|
| 431 |
+
_CG_DEPRECATED _CG_STATIC_QUALIFIER void memcpy_async(TyGroup &group, TyElem *dst, size_t dstCount, const TyElem *src, size_t srcCount,
|
| 432 |
+
nvcuda::experimental::pipeline &pipe) {
|
| 433 |
+
details::_memcpy_async_datum(group, dst, dstCount, src, srcCount);
|
| 434 |
+
pipe.commit();
|
| 435 |
+
}
|
| 436 |
+
|
| 437 |
+
/* Group wait for prior Nth stage of memcpy_async to complete. */
|
| 438 |
+
template <unsigned int Stage, class TyGroup>
|
| 439 |
+
_CG_DEPRECATED _CG_STATIC_QUALIFIER void wait_prior(TyGroup &group, nvcuda::experimental::pipeline &pipe) {
|
| 440 |
+
pipe.wait_prior<Stage>();
|
| 441 |
+
group.sync();
|
| 442 |
+
}
|
| 443 |
+
|
| 444 |
+
/* Group wait for stage-S of memcpy_async to complete. */
|
| 445 |
+
template <class TyGroup>
|
| 446 |
+
_CG_DEPRECATED _CG_STATIC_QUALIFIER void wait(TyGroup &group, nvcuda::experimental::pipeline &pipe, size_t stage) {
|
| 447 |
+
pipe.wait(stage);
|
| 448 |
+
group.sync();
|
| 449 |
+
}
|
| 450 |
+
_CG_END_NAMESPACE
|
| 451 |
+
|
| 452 |
+
#endif // _CG_ASYNC_H
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/coalesced_reduce.h
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_COALESCED_REDUCE_H_
|
| 50 |
+
#define _CG_COALESCED_REDUCE_H_
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "helpers.h"
|
| 54 |
+
#include "cooperative_groups.h"
|
| 55 |
+
#include "partitioning.h"
|
| 56 |
+
#include "coalesced_scan.h"
|
| 57 |
+
|
| 58 |
+
_CG_BEGIN_NAMESPACE
|
| 59 |
+
|
| 60 |
+
namespace details {
|
| 61 |
+
|
| 62 |
+
template <typename TyVal, typename TyOp, unsigned int TySize, typename ParentT>
|
| 63 |
+
_CG_QUALIFIER auto coalesced_reduce(const __single_warp_thread_block_tile<TySize, ParentT>& group,
|
| 64 |
+
TyVal&& val,
|
| 65 |
+
TyOp&& op) -> decltype(op(val, val)) {
|
| 66 |
+
auto out = val;
|
| 67 |
+
for (int mask = TySize >> 1; mask > 0; mask >>= 1) {
|
| 68 |
+
out = op(out, group.shfl_xor(out, mask));
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
return out;
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
template <typename TyVal, typename TyOp>
|
| 75 |
+
_CG_QUALIFIER auto coalesced_reduce(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
|
| 76 |
+
if (group.size() == 32) {
|
| 77 |
+
// Full coalesced group can go through faster path by being treated as a tile of size 32
|
| 78 |
+
auto tile = details::tiled_partition_internal<32, void>();
|
| 79 |
+
return coalesced_reduce(tile, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 80 |
+
}
|
| 81 |
+
else {
|
| 82 |
+
auto scan_result =
|
| 83 |
+
inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 84 |
+
unsigned int group_mask = _coalesced_group_data_access::get_mask(group);
|
| 85 |
+
unsigned int last_thread_id = 31 - __clz(group_mask);
|
| 86 |
+
return details::tile::shuffle_dispatch<TyVal>::shfl(
|
| 87 |
+
_CG_STL_NAMESPACE::forward<TyVal>(scan_result), group_mask, last_thread_id, 32);
|
| 88 |
+
}
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
} // details
|
| 92 |
+
|
| 93 |
+
_CG_END_NAMESPACE
|
| 94 |
+
|
| 95 |
+
#endif // _CG_COALESCED_REDUCE_H_
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/functional.h
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_FUNCTIONAL_H
|
| 50 |
+
#define _CG_FUNCTIONAL_H
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "helpers.h"
|
| 54 |
+
|
| 55 |
+
#ifdef _CG_CPP11_FEATURES
|
| 56 |
+
#ifdef _CG_USE_CUDA_STL
|
| 57 |
+
# include <cuda/std/functional>
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
_CG_BEGIN_NAMESPACE
|
| 61 |
+
|
| 62 |
+
namespace details {
|
| 63 |
+
#ifdef _CG_USE_CUDA_STL
|
| 64 |
+
using cuda::std::plus;
|
| 65 |
+
using cuda::std::bit_and;
|
| 66 |
+
using cuda::std::bit_xor;
|
| 67 |
+
using cuda::std::bit_or;
|
| 68 |
+
#else
|
| 69 |
+
template <typename Ty> struct plus {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 + arg2;}};
|
| 70 |
+
template <typename Ty> struct bit_and {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 & arg2;}};
|
| 71 |
+
template <typename Ty> struct bit_xor {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 ^ arg2;}};
|
| 72 |
+
template <typename Ty> struct bit_or {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 | arg2;}};
|
| 73 |
+
#endif // _CG_USE_PLATFORM_STL
|
| 74 |
+
} // details
|
| 75 |
+
|
| 76 |
+
template <typename Ty>
|
| 77 |
+
struct plus : public details::plus<Ty> {};
|
| 78 |
+
|
| 79 |
+
template <typename Ty>
|
| 80 |
+
struct less {
|
| 81 |
+
__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
|
| 82 |
+
return (arg2 < arg1) ? arg2 : arg1;
|
| 83 |
+
}
|
| 84 |
+
};
|
| 85 |
+
|
| 86 |
+
template <typename Ty>
|
| 87 |
+
struct greater {
|
| 88 |
+
__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
|
| 89 |
+
return (arg1 < arg2) ? arg2 : arg1;
|
| 90 |
+
}
|
| 91 |
+
};
|
| 92 |
+
|
| 93 |
+
template <typename Ty>
|
| 94 |
+
struct bit_and : public details::bit_and<Ty> {};
|
| 95 |
+
|
| 96 |
+
template <typename Ty>
|
| 97 |
+
struct bit_xor : public details::bit_xor<Ty> {};
|
| 98 |
+
|
| 99 |
+
template <typename Ty>
|
| 100 |
+
struct bit_or : public details::bit_or<Ty> {};
|
| 101 |
+
|
| 102 |
+
#if defined(_CG_HAS_STL_ATOMICS)
|
| 103 |
+
namespace details {
|
| 104 |
+
template <class Ty>
|
| 105 |
+
using _atomic_is_type_supported = _CG_STL_NAMESPACE::integral_constant<bool,
|
| 106 |
+
_CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) == 4 || sizeof(Ty) == 8)>;
|
| 107 |
+
|
| 108 |
+
template <typename TyOp> struct _atomic_op_supported : public _CG_STL_NAMESPACE::false_type {};
|
| 109 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::plus<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 110 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::less<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 111 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::greater<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 112 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_and<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 113 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_or<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 114 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_xor<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 115 |
+
|
| 116 |
+
template<typename TyAtomic, typename TyVal, typename TyOp>
|
| 117 |
+
_CG_QUALIFIER remove_qual<TyVal> atomic_cas_fallback(TyAtomic&& atomic, TyVal&& val, TyOp&& op) {
|
| 118 |
+
auto old = atomic.load(cuda::std::memory_order_relaxed);
|
| 119 |
+
while(!atomic.compare_exchange_weak(old, op(old, val), cuda::std::memory_order_relaxed));
|
| 120 |
+
return old;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
template<typename TyOp>
|
| 124 |
+
struct op_picker;
|
| 125 |
+
|
| 126 |
+
template<typename TyVal>
|
| 127 |
+
struct op_picker<cooperative_groups::plus<TyVal>> {
|
| 128 |
+
template<typename TyAtomic>
|
| 129 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 130 |
+
return atomic.fetch_add(val, cuda::std::memory_order_relaxed);
|
| 131 |
+
}
|
| 132 |
+
};
|
| 133 |
+
|
| 134 |
+
template<typename TyVal>
|
| 135 |
+
struct op_picker<cooperative_groups::less<TyVal>> {
|
| 136 |
+
template<typename TyAtomic>
|
| 137 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 138 |
+
return atomic.fetch_min(val, cuda::std::memory_order_relaxed);
|
| 139 |
+
}
|
| 140 |
+
};
|
| 141 |
+
|
| 142 |
+
template<typename TyVal>
|
| 143 |
+
struct op_picker<cooperative_groups::greater<TyVal>> {
|
| 144 |
+
template<typename TyAtomic>
|
| 145 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 146 |
+
return atomic.fetch_max(val, cuda::std::memory_order_relaxed);
|
| 147 |
+
}
|
| 148 |
+
};
|
| 149 |
+
|
| 150 |
+
template<typename TyVal>
|
| 151 |
+
struct op_picker<cooperative_groups::bit_and<TyVal>> {
|
| 152 |
+
template<typename TyAtomic>
|
| 153 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 154 |
+
return atomic.fetch_and(val, cuda::std::memory_order_relaxed);
|
| 155 |
+
}
|
| 156 |
+
};
|
| 157 |
+
|
| 158 |
+
template<typename TyVal>
|
| 159 |
+
struct op_picker<cooperative_groups::bit_xor<TyVal>> {
|
| 160 |
+
template<typename TyAtomic>
|
| 161 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 162 |
+
return atomic.fetch_xor(val, cuda::std::memory_order_relaxed);
|
| 163 |
+
}
|
| 164 |
+
};
|
| 165 |
+
|
| 166 |
+
template<typename TyVal>
|
| 167 |
+
struct op_picker<cooperative_groups::bit_or<TyVal>> {
|
| 168 |
+
template<typename TyAtomic>
|
| 169 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 170 |
+
return atomic.fetch_or(val, cuda::std::memory_order_relaxed);
|
| 171 |
+
}
|
| 172 |
+
};
|
| 173 |
+
|
| 174 |
+
template<bool atomic_supported>
|
| 175 |
+
struct atomic_update_dispatch {};
|
| 176 |
+
|
| 177 |
+
template<>
|
| 178 |
+
struct atomic_update_dispatch<false> {
|
| 179 |
+
template<typename TyAtomic, typename TyVal, typename TyOp>
|
| 180 |
+
_CG_STATIC_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
|
| 181 |
+
return atomic_cas_fallback(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 182 |
+
}
|
| 183 |
+
};
|
| 184 |
+
|
| 185 |
+
template<>
|
| 186 |
+
struct atomic_update_dispatch<true> {
|
| 187 |
+
template<typename TyAtomic, typename TyVal, typename TyOp>
|
| 188 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val, TyOp&& op) {
|
| 189 |
+
using dispatch = op_picker<details::remove_qual<TyOp>>;
|
| 190 |
+
|
| 191 |
+
return dispatch::atomic_update(atomic, val);
|
| 192 |
+
}
|
| 193 |
+
};
|
| 194 |
+
|
| 195 |
+
template<typename TyAtomic, typename TyVal, typename TyOp>
|
| 196 |
+
_CG_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
|
| 197 |
+
using dispatch = atomic_update_dispatch<_atomic_op_supported<details::remove_qual<TyOp>>::value>;
|
| 198 |
+
|
| 199 |
+
return dispatch::atomic_update(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
template<typename TyAtomic, typename TyVal>
|
| 203 |
+
_CG_QUALIFIER void atomic_store(TyAtomic& atomic, TyVal&& val) {
|
| 204 |
+
atomic.store(val, cuda::std::memory_order_relaxed);
|
| 205 |
+
}
|
| 206 |
+
}
|
| 207 |
+
#endif
|
| 208 |
+
|
| 209 |
+
_CG_END_NAMESPACE
|
| 210 |
+
|
| 211 |
+
#endif
|
| 212 |
+
#endif //_CG_FUNCTIONAL_H
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/helpers.h
ADDED
|
@@ -0,0 +1,693 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _COOPERATIVE_GROUPS_HELPERS_H_
|
| 50 |
+
# define _COOPERATIVE_GROUPS_HELPERS_H_
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "sync.h"
|
| 54 |
+
|
| 55 |
+
_CG_BEGIN_NAMESPACE
|
| 56 |
+
|
| 57 |
+
namespace details {
|
| 58 |
+
#ifdef _CG_CPP11_FEATURES
|
| 59 |
+
template <typename Ty> struct _is_float_or_half : public _CG_STL_NAMESPACE::is_floating_point<Ty> {};
|
| 60 |
+
# ifdef _CG_HAS_FP16_COLLECTIVE
|
| 61 |
+
template <> struct _is_float_or_half<__half> : public _CG_STL_NAMESPACE::true_type {};
|
| 62 |
+
template <> struct _is_float_or_half<__half2> : public _CG_STL_NAMESPACE::true_type {};
|
| 63 |
+
# endif
|
| 64 |
+
template <typename Ty>
|
| 65 |
+
using is_float_or_half = _is_float_or_half<typename _CG_STL_NAMESPACE::remove_cv<Ty>::type>;
|
| 66 |
+
|
| 67 |
+
// Non-STL utility templates
|
| 68 |
+
template <typename Ty>
|
| 69 |
+
using remove_qual = typename _CG_STL_NAMESPACE::remove_cv<typename _CG_STL_NAMESPACE::remove_reference<Ty>::type>::type;
|
| 70 |
+
|
| 71 |
+
template <typename TyLhs, typename TyRhs>
|
| 72 |
+
using is_op_type_same = _CG_STL_NAMESPACE::is_same<remove_qual<TyLhs>, remove_qual<TyRhs>
|
| 73 |
+
>;
|
| 74 |
+
#endif
|
| 75 |
+
|
| 76 |
+
template <typename TyTrunc>
|
| 77 |
+
_CG_STATIC_QUALIFIER TyTrunc vec3_to_linear(dim3 index, dim3 nIndex) {
|
| 78 |
+
return ((TyTrunc)index.z * nIndex.y * nIndex.x) +
|
| 79 |
+
((TyTrunc)index.y * nIndex.x) +
|
| 80 |
+
(TyTrunc)index.x;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
namespace cta {
|
| 84 |
+
|
| 85 |
+
_CG_STATIC_QUALIFIER void sync()
|
| 86 |
+
{
|
| 87 |
+
__barrier_sync(0);
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
_CG_STATIC_QUALIFIER unsigned int num_threads()
|
| 91 |
+
{
|
| 92 |
+
return static_cast<unsigned int>(blockDim.x * blockDim.y * blockDim.z);
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank()
|
| 96 |
+
{
|
| 97 |
+
return vec3_to_linear<unsigned int>(threadIdx, blockDim);
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
_CG_STATIC_QUALIFIER dim3 group_index()
|
| 101 |
+
{
|
| 102 |
+
return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
_CG_STATIC_QUALIFIER dim3 thread_index()
|
| 106 |
+
{
|
| 107 |
+
return dim3(threadIdx.x, threadIdx.y, threadIdx.z);
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads()
|
| 111 |
+
{
|
| 112 |
+
return dim3(blockDim.x, blockDim.y, blockDim.z);
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
// Legacy aliases
|
| 116 |
+
_CG_STATIC_QUALIFIER unsigned int size()
|
| 117 |
+
{
|
| 118 |
+
return num_threads();
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
_CG_STATIC_QUALIFIER dim3 block_dim()
|
| 122 |
+
{
|
| 123 |
+
return dim_threads();
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
};
|
| 127 |
+
|
| 128 |
+
class _coalesced_group_data_access {
|
| 129 |
+
public:
|
| 130 |
+
// Retrieve mask of coalesced groups and tiles
|
| 131 |
+
template <typename TyGroup>
|
| 132 |
+
_CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup &group) {
|
| 133 |
+
return group.get_mask();
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
template <typename TyGroup>
|
| 137 |
+
_CG_STATIC_QUALIFIER TyGroup construct_from_mask(unsigned int mask) {
|
| 138 |
+
return TyGroup(mask);
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
template <typename TyGroup>
|
| 142 |
+
_CG_STATIC_QUALIFIER void modify_meta_group(TyGroup &group, unsigned int mgRank, unsigned int mgSize) {
|
| 143 |
+
group._data.coalesced.metaGroupRank = mgRank;
|
| 144 |
+
group._data.coalesced.metaGroupSize = mgSize;
|
| 145 |
+
}
|
| 146 |
+
};
|
| 147 |
+
|
| 148 |
+
namespace tile {
|
| 149 |
+
template <unsigned int TileCount, unsigned int TileMask, unsigned int LaneMask, unsigned int ShiftCount>
|
| 150 |
+
struct _tile_helpers{
|
| 151 |
+
_CG_STATIC_CONST_DECL unsigned int tileCount = TileCount;
|
| 152 |
+
_CG_STATIC_CONST_DECL unsigned int tileMask = TileMask;
|
| 153 |
+
_CG_STATIC_CONST_DECL unsigned int laneMask = LaneMask;
|
| 154 |
+
_CG_STATIC_CONST_DECL unsigned int shiftCount = ShiftCount;
|
| 155 |
+
};
|
| 156 |
+
|
| 157 |
+
template <unsigned int> struct tile_helpers;
|
| 158 |
+
template <> struct tile_helpers<32> : public _tile_helpers<1, 0xFFFFFFFF, 0x1F, 5> {};
|
| 159 |
+
template <> struct tile_helpers<16> : public _tile_helpers<2, 0x0000FFFF, 0x0F, 4> {};
|
| 160 |
+
template <> struct tile_helpers<8> : public _tile_helpers<4, 0x000000FF, 0x07, 3> {};
|
| 161 |
+
template <> struct tile_helpers<4> : public _tile_helpers<8, 0x0000000F, 0x03, 2> {};
|
| 162 |
+
template <> struct tile_helpers<2> : public _tile_helpers<16, 0x00000003, 0x01, 1> {};
|
| 163 |
+
template <> struct tile_helpers<1> : public _tile_helpers<32, 0x00000001, 0x00, 0> {};
|
| 164 |
+
|
| 165 |
+
#ifdef _CG_CPP11_FEATURES
|
| 166 |
+
namespace shfl {
|
| 167 |
+
/***********************************************************************************
|
| 168 |
+
* Recursively Sliced Shuffle
|
| 169 |
+
* Purpose:
|
| 170 |
+
* Slices an input type a number of times into integral types so that shuffles
|
| 171 |
+
* are well defined
|
| 172 |
+
* Expectations:
|
| 173 |
+
* This object *should not* be used from a reinterpret_cast pointer unless
|
| 174 |
+
* some alignment guarantees can be met. Use a memcpy to guarantee that loads
|
| 175 |
+
* from the integral types stored within are aligned and correct.
|
| 176 |
+
**********************************************************************************/
|
| 177 |
+
template <unsigned int count, bool intSized = (count <= sizeof(int))>
|
| 178 |
+
struct recursive_sliced_shuffle_helper;
|
| 179 |
+
|
| 180 |
+
template <unsigned int count>
|
| 181 |
+
struct recursive_sliced_shuffle_helper<count, true> {
|
| 182 |
+
int val;
|
| 183 |
+
|
| 184 |
+
template <typename TyFn>
|
| 185 |
+
_CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
|
| 186 |
+
val = shfl(val);
|
| 187 |
+
}
|
| 188 |
+
};
|
| 189 |
+
|
| 190 |
+
template <unsigned int count>
|
| 191 |
+
struct recursive_sliced_shuffle_helper<count, false> {
|
| 192 |
+
int val;
|
| 193 |
+
recursive_sliced_shuffle_helper<count - sizeof(int)> next;
|
| 194 |
+
|
| 195 |
+
template <typename TyFn>
|
| 196 |
+
_CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
|
| 197 |
+
val = shfl(val);
|
| 198 |
+
next.invoke_shuffle(shfl);
|
| 199 |
+
}
|
| 200 |
+
};
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
struct _memory_shuffle {
|
| 204 |
+
template <typename TyElem, typename TyShflFn>
|
| 205 |
+
_CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
|
| 206 |
+
static_assert(sizeof(TyElem) <= 32, "Cooperative groups collectives are limited to types smaller than 32B");
|
| 207 |
+
return TyElem{};
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 211 |
+
_CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
|
| 212 |
+
auto shfl = [=](int val) -> int {
|
| 213 |
+
return 0;
|
| 214 |
+
};
|
| 215 |
+
|
| 216 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 220 |
+
_CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 221 |
+
auto shfl = [=](int val) -> int {
|
| 222 |
+
return 0;
|
| 223 |
+
};
|
| 224 |
+
|
| 225 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 229 |
+
_CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 230 |
+
auto shfl = [=](int val) -> int {
|
| 231 |
+
return 0;
|
| 232 |
+
};
|
| 233 |
+
|
| 234 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 238 |
+
_CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
|
| 239 |
+
auto shfl = [=](int val) -> int {
|
| 240 |
+
return 0;
|
| 241 |
+
};
|
| 242 |
+
|
| 243 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 244 |
+
}
|
| 245 |
+
};
|
| 246 |
+
|
| 247 |
+
/***********************************************************************************
|
| 248 |
+
* Intrinsic Device Function Shuffle
|
| 249 |
+
* Purpose:
|
| 250 |
+
* Uses a shuffle helper that has characteristics best suited for moving
|
| 251 |
+
* elements between threads
|
| 252 |
+
* Expectations:
|
| 253 |
+
* Object given will be forced into an l-value type so that it can be used
|
| 254 |
+
* with a helper structure that reinterprets the data into intrinsic compatible
|
| 255 |
+
* types
|
| 256 |
+
* Notes:
|
| 257 |
+
* !! TyRet is required so that objects are returned by value and not as
|
| 258 |
+
* dangling references depending on the value category of the passed object
|
| 259 |
+
**********************************************************************************/
|
| 260 |
+
struct _intrinsic_compat_shuffle {
|
| 261 |
+
template <unsigned int count>
|
| 262 |
+
using shfl_helper = shfl::recursive_sliced_shuffle_helper<count>;
|
| 263 |
+
|
| 264 |
+
template <typename TyElem, typename TyShflFn>
|
| 265 |
+
_CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
|
| 266 |
+
static_assert(__is_trivially_copyable(TyElem), "Type is not compatible with device shuffle");
|
| 267 |
+
shfl_helper<sizeof(TyElem)> helper;
|
| 268 |
+
memcpy(&helper, &elem, sizeof(TyElem));
|
| 269 |
+
helper.invoke_shuffle(fn);
|
| 270 |
+
memcpy(&elem, &helper, sizeof(TyElem));
|
| 271 |
+
return elem;
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 275 |
+
_CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
|
| 276 |
+
auto shfl = [=](int val) -> int {
|
| 277 |
+
return __shfl_sync(gMask, val, srcRank, threads);
|
| 278 |
+
};
|
| 279 |
+
|
| 280 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 284 |
+
_CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 285 |
+
auto shfl = [=](int val) -> int {
|
| 286 |
+
return __shfl_down_sync(gMask, val, delta, threads);
|
| 287 |
+
};
|
| 288 |
+
|
| 289 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 293 |
+
_CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 294 |
+
auto shfl = [=](int val) -> int {
|
| 295 |
+
return __shfl_up_sync(gMask, val, delta, threads);
|
| 296 |
+
};
|
| 297 |
+
|
| 298 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 302 |
+
_CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
|
| 303 |
+
auto shfl = [=](int val) -> int {
|
| 304 |
+
return __shfl_xor_sync(gMask, val, lMask, threads);
|
| 305 |
+
};
|
| 306 |
+
|
| 307 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 308 |
+
}
|
| 309 |
+
};
|
| 310 |
+
|
| 311 |
+
struct _native_shuffle {
|
| 312 |
+
template <typename TyElem>
|
| 313 |
+
_CG_STATIC_QUALIFIER TyElem shfl(
|
| 314 |
+
TyElem elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
|
| 315 |
+
return static_cast<TyElem>(__shfl_sync(gMask, elem, srcRank, threads));
|
| 316 |
+
}
|
| 317 |
+
|
| 318 |
+
template <typename TyElem>
|
| 319 |
+
_CG_STATIC_QUALIFIER TyElem shfl_down(
|
| 320 |
+
TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 321 |
+
return static_cast<TyElem>(__shfl_down_sync(gMask, elem, delta, threads));
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
+
template <typename TyElem>
|
| 325 |
+
_CG_STATIC_QUALIFIER TyElem shfl_up(
|
| 326 |
+
TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 327 |
+
return static_cast<TyElem>(__shfl_up_sync(gMask, elem, delta, threads));
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
template <typename TyElem>
|
| 331 |
+
_CG_STATIC_QUALIFIER TyElem shfl_xor(
|
| 332 |
+
TyElem elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
|
| 333 |
+
return static_cast<TyElem>(__shfl_xor_sync(gMask, elem, lMask, threads));
|
| 334 |
+
}
|
| 335 |
+
};
|
| 336 |
+
|
| 337 |
+
// Almost all arithmetic types are supported by native shuffle
|
| 338 |
+
// Vector types are the exception
|
| 339 |
+
template <typename TyElem>
|
| 340 |
+
using use_native_shuffle = _CG_STL_NAMESPACE::integral_constant<
|
| 341 |
+
bool,
|
| 342 |
+
_CG_STL_NAMESPACE::is_integral<
|
| 343 |
+
remove_qual<TyElem>>::value ||
|
| 344 |
+
details::is_float_or_half<
|
| 345 |
+
remove_qual<TyElem>>::value
|
| 346 |
+
>;
|
| 347 |
+
|
| 348 |
+
constexpr unsigned long long _MemoryShuffleCutoff = 32;
|
| 349 |
+
|
| 350 |
+
template <typename TyElem,
|
| 351 |
+
bool IsNative = use_native_shuffle<TyElem>::value,
|
| 352 |
+
bool InMem = (sizeof(TyElem) > _MemoryShuffleCutoff)>
|
| 353 |
+
struct shuffle_dispatch;
|
| 354 |
+
|
| 355 |
+
template <typename TyElem>
|
| 356 |
+
struct shuffle_dispatch<TyElem, true, false> : public _native_shuffle {};
|
| 357 |
+
|
| 358 |
+
template <typename TyElem>
|
| 359 |
+
struct shuffle_dispatch<TyElem, false, false> : public _intrinsic_compat_shuffle {};
|
| 360 |
+
|
| 361 |
+
template <typename TyElem>
|
| 362 |
+
struct shuffle_dispatch<TyElem, false, true> : public _memory_shuffle {};
|
| 363 |
+
|
| 364 |
+
#endif //_CG_CPP11_FEATURES
|
| 365 |
+
};
|
| 366 |
+
|
| 367 |
+
namespace multi_grid {
|
| 368 |
+
struct multi_grid_functions;
|
| 369 |
+
};
|
| 370 |
+
|
| 371 |
+
namespace grid {
|
| 372 |
+
_CG_STATIC_QUALIFIER unsigned int barrier_arrive(unsigned int *bar) {
|
| 373 |
+
return details::sync_grids_arrive(bar);
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
_CG_STATIC_QUALIFIER void barrier_wait(unsigned int token, unsigned int *bar) {
|
| 377 |
+
details::sync_grids_wait(token, bar);
|
| 378 |
+
}
|
| 379 |
+
|
| 380 |
+
_CG_STATIC_QUALIFIER void sync(unsigned int *bar) {
|
| 381 |
+
unsigned int token = details::sync_grids_arrive(bar);
|
| 382 |
+
details::sync_grids_wait(token, bar);
|
| 383 |
+
}
|
| 384 |
+
|
| 385 |
+
_CG_STATIC_QUALIFIER unsigned long long num_blocks()
|
| 386 |
+
{
|
| 387 |
+
// grid.y * grid.z -> [max(65535) * max(65535)] fits within 4b, promote after multiplication
|
| 388 |
+
// grid.x * (grid.y * grid.z) -> [max(2^31-1) * max(65535 * 65535)] exceeds 4b, promote before multiplication
|
| 389 |
+
return (unsigned long long)gridDim.x * (gridDim.y * gridDim.z);
|
| 390 |
+
}
|
| 391 |
+
|
| 392 |
+
_CG_STATIC_QUALIFIER unsigned long long num_threads()
|
| 393 |
+
{
|
| 394 |
+
return num_blocks() * cta::num_threads();
|
| 395 |
+
}
|
| 396 |
+
|
| 397 |
+
_CG_STATIC_QUALIFIER unsigned long long block_rank()
|
| 398 |
+
{
|
| 399 |
+
return vec3_to_linear<unsigned long long>(blockIdx, gridDim);
|
| 400 |
+
}
|
| 401 |
+
|
| 402 |
+
_CG_STATIC_QUALIFIER unsigned long long thread_rank()
|
| 403 |
+
{
|
| 404 |
+
return block_rank() * cta::num_threads() + cta::thread_rank();
|
| 405 |
+
}
|
| 406 |
+
|
| 407 |
+
_CG_STATIC_QUALIFIER dim3 dim_blocks()
|
| 408 |
+
{
|
| 409 |
+
return dim3(gridDim.x, gridDim.y, gridDim.z);
|
| 410 |
+
}
|
| 411 |
+
|
| 412 |
+
_CG_STATIC_QUALIFIER dim3 block_index()
|
| 413 |
+
{
|
| 414 |
+
return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
|
| 415 |
+
}
|
| 416 |
+
|
| 417 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads()
|
| 418 |
+
{
|
| 419 |
+
return dim3(gridDim.x * blockDim.x, gridDim.y * blockDim.y, gridDim.z * blockDim.z);
|
| 420 |
+
}
|
| 421 |
+
|
| 422 |
+
_CG_STATIC_QUALIFIER dim3 thread_index()
|
| 423 |
+
{
|
| 424 |
+
return dim3(blockIdx.x * blockDim.x + threadIdx.x,
|
| 425 |
+
blockIdx.y * blockDim.y + threadIdx.y,
|
| 426 |
+
blockIdx.z * blockDim.z + threadIdx.z);
|
| 427 |
+
}
|
| 428 |
+
|
| 429 |
+
#if defined(_CG_HAS_CLUSTER_GROUP)
|
| 430 |
+
_CG_STATIC_QUALIFIER dim3 dim_clusters() {
|
| 431 |
+
return __clusterGridDimInClusters();
|
| 432 |
+
}
|
| 433 |
+
|
| 434 |
+
_CG_STATIC_QUALIFIER unsigned long long num_clusters() {
|
| 435 |
+
const dim3 dimClusters = dim_clusters();
|
| 436 |
+
return dimClusters.x * dimClusters.y * dimClusters.z;
|
| 437 |
+
}
|
| 438 |
+
|
| 439 |
+
_CG_STATIC_QUALIFIER dim3 cluster_index() {
|
| 440 |
+
return __clusterIdx();
|
| 441 |
+
}
|
| 442 |
+
|
| 443 |
+
_CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
|
| 444 |
+
return vec3_to_linear<unsigned long long>(cluster_index(), dim_clusters());
|
| 445 |
+
}
|
| 446 |
+
#endif
|
| 447 |
+
|
| 448 |
+
// Legacy aliases
|
| 449 |
+
_CG_STATIC_QUALIFIER unsigned long long size()
|
| 450 |
+
{
|
| 451 |
+
return num_threads();
|
| 452 |
+
}
|
| 453 |
+
|
| 454 |
+
_CG_STATIC_QUALIFIER dim3 grid_dim()
|
| 455 |
+
{
|
| 456 |
+
return dim_blocks();
|
| 457 |
+
}
|
| 458 |
+
};
|
| 459 |
+
|
| 460 |
+
|
| 461 |
+
#if defined(_CG_HAS_MULTI_GRID_GROUP)
|
| 462 |
+
|
| 463 |
+
namespace multi_grid {
|
| 464 |
+
_CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle()
|
| 465 |
+
{
|
| 466 |
+
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
|
| 467 |
+
//this function is defined in device runtime library
|
| 468 |
+
//which requires separate compilation mode (__CUDACC_RDC__)
|
| 469 |
+
//or extended whole program mode (__CUDACC_EWP__)
|
| 470 |
+
return (cudaCGGetIntrinsicHandle(cudaCGScopeMultiGrid));
|
| 471 |
+
#else /* !(__CUDACC_RDC__ || __CUDACC_EWP__) */
|
| 472 |
+
return 0;
|
| 473 |
+
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
|
| 474 |
+
}
|
| 475 |
+
|
| 476 |
+
_CG_STATIC_QUALIFIER void sync(const unsigned long long handle)
|
| 477 |
+
{
|
| 478 |
+
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
|
| 479 |
+
//this function is defined in device runtime library
|
| 480 |
+
//which requires separate compilation mode (__CUDACC_RDC__)
|
| 481 |
+
//or extended whole program mode (__CUDACC_EWP__)
|
| 482 |
+
cudaError_t err = cudaCGSynchronize(handle, 0);
|
| 483 |
+
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
|
| 484 |
+
}
|
| 485 |
+
|
| 486 |
+
_CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle)
|
| 487 |
+
{
|
| 488 |
+
unsigned int numThreads = 0;
|
| 489 |
+
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
|
| 490 |
+
//this function is defined in device runtime library
|
| 491 |
+
//which requires separate compilation mode (__CUDACC_RDC__)
|
| 492 |
+
//or extended whole program mode (__CUDACC_EWP__)
|
| 493 |
+
cudaCGGetSize(&numThreads, NULL, handle);
|
| 494 |
+
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
|
| 495 |
+
return numThreads;
|
| 496 |
+
}
|
| 497 |
+
|
| 498 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle)
|
| 499 |
+
{
|
| 500 |
+
unsigned int threadRank = 0;
|
| 501 |
+
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
|
| 502 |
+
//this function is defined in device runtime library
|
| 503 |
+
//which requires separate compilation mode (__CUDACC_RDC__)
|
| 504 |
+
//or extended whole program mode (__CUDACC_EWP__)
|
| 505 |
+
cudaCGGetRank(&threadRank, NULL, handle);
|
| 506 |
+
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
|
| 507 |
+
return threadRank;
|
| 508 |
+
}
|
| 509 |
+
|
| 510 |
+
_CG_STATIC_QUALIFIER unsigned int grid_rank(const unsigned long long handle)
|
| 511 |
+
{
|
| 512 |
+
unsigned int gridRank = 0;
|
| 513 |
+
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
|
| 514 |
+
//this function is defined in device runtime library
|
| 515 |
+
//which requires separate compilation mode (__CUDACC_RDC__)
|
| 516 |
+
//or extended whole program mode (__CUDACC_EWP__)
|
| 517 |
+
cudaCGGetRank(NULL, &gridRank, handle);
|
| 518 |
+
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
|
| 519 |
+
return gridRank;
|
| 520 |
+
}
|
| 521 |
+
|
| 522 |
+
_CG_STATIC_QUALIFIER unsigned int num_grids(const unsigned long long handle)
|
| 523 |
+
{
|
| 524 |
+
unsigned int numGrids = 0;
|
| 525 |
+
#if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
|
| 526 |
+
//this function is defined in device runtime library
|
| 527 |
+
//which requires separate compilation mode (__CUDACC_RDC__)
|
| 528 |
+
//or extended whole program mode (__CUDACC_EWP__)
|
| 529 |
+
cudaCGGetSize(NULL, &numGrids, handle);
|
| 530 |
+
#endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
|
| 531 |
+
return numGrids;
|
| 532 |
+
}
|
| 533 |
+
|
| 534 |
+
# ifdef _CG_CPP11_FEATURES
|
| 535 |
+
struct multi_grid_functions {
|
| 536 |
+
decltype(multi_grid::get_intrinsic_handle) *get_intrinsic_handle;
|
| 537 |
+
decltype(multi_grid::sync) *sync;
|
| 538 |
+
decltype(multi_grid::size) *size;
|
| 539 |
+
decltype(multi_grid::thread_rank) *thread_rank;
|
| 540 |
+
decltype(multi_grid::grid_rank) *grid_rank;
|
| 541 |
+
decltype(multi_grid::num_grids) *num_grids;
|
| 542 |
+
};
|
| 543 |
+
|
| 544 |
+
template <typename = void>
|
| 545 |
+
_CG_STATIC_QUALIFIER const multi_grid_functions* load_grid_intrinsics() {
|
| 546 |
+
__constant__ static const multi_grid_functions mgf {
|
| 547 |
+
&multi_grid::get_intrinsic_handle,
|
| 548 |
+
&multi_grid::sync,
|
| 549 |
+
&multi_grid::size,
|
| 550 |
+
&multi_grid::thread_rank,
|
| 551 |
+
&multi_grid::grid_rank,
|
| 552 |
+
&multi_grid::num_grids
|
| 553 |
+
};
|
| 554 |
+
|
| 555 |
+
return &mgf;
|
| 556 |
+
}
|
| 557 |
+
# endif
|
| 558 |
+
};
|
| 559 |
+
#endif
|
| 560 |
+
|
| 561 |
+
#if defined(_CG_HAS_CLUSTER_GROUP)
|
| 562 |
+
namespace cluster {
|
| 563 |
+
|
| 564 |
+
_CG_STATIC_QUALIFIER bool isReal()
|
| 565 |
+
{
|
| 566 |
+
return __clusterDimIsSpecified();
|
| 567 |
+
}
|
| 568 |
+
|
| 569 |
+
_CG_STATIC_QUALIFIER void barrier_arrive()
|
| 570 |
+
{
|
| 571 |
+
__cluster_barrier_arrive();
|
| 572 |
+
}
|
| 573 |
+
|
| 574 |
+
_CG_STATIC_QUALIFIER void barrier_wait()
|
| 575 |
+
{
|
| 576 |
+
__cluster_barrier_wait();
|
| 577 |
+
}
|
| 578 |
+
|
| 579 |
+
_CG_STATIC_QUALIFIER void sync()
|
| 580 |
+
{
|
| 581 |
+
barrier_arrive();
|
| 582 |
+
barrier_wait();
|
| 583 |
+
}
|
| 584 |
+
|
| 585 |
+
_CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
|
| 586 |
+
{
|
| 587 |
+
return __cluster_query_shared_rank(addr);
|
| 588 |
+
}
|
| 589 |
+
|
| 590 |
+
template <typename T>
|
| 591 |
+
_CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
|
| 592 |
+
{
|
| 593 |
+
return static_cast<T*>(__cluster_map_shared_rank(addr, rank));
|
| 594 |
+
}
|
| 595 |
+
|
| 596 |
+
_CG_STATIC_QUALIFIER dim3 block_index()
|
| 597 |
+
{
|
| 598 |
+
return __clusterRelativeBlockIdx();
|
| 599 |
+
}
|
| 600 |
+
|
| 601 |
+
_CG_STATIC_QUALIFIER unsigned int block_rank()
|
| 602 |
+
{
|
| 603 |
+
return __clusterRelativeBlockRank();
|
| 604 |
+
}
|
| 605 |
+
|
| 606 |
+
_CG_STATIC_QUALIFIER dim3 thread_index()
|
| 607 |
+
{
|
| 608 |
+
const dim3 blockIndex = block_index();
|
| 609 |
+
return dim3(blockIndex.x * blockDim.x + threadIdx.x,
|
| 610 |
+
blockIndex.y * blockDim.y + threadIdx.y,
|
| 611 |
+
blockIndex.z * blockDim.z + threadIdx.z);
|
| 612 |
+
}
|
| 613 |
+
|
| 614 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank()
|
| 615 |
+
{
|
| 616 |
+
return block_rank() * cta::num_threads() + cta::thread_rank();
|
| 617 |
+
}
|
| 618 |
+
|
| 619 |
+
_CG_STATIC_QUALIFIER dim3 dim_blocks()
|
| 620 |
+
{
|
| 621 |
+
return __clusterDim();
|
| 622 |
+
}
|
| 623 |
+
|
| 624 |
+
_CG_STATIC_QUALIFIER unsigned int num_blocks()
|
| 625 |
+
{
|
| 626 |
+
return __clusterSizeInBlocks();
|
| 627 |
+
}
|
| 628 |
+
|
| 629 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads()
|
| 630 |
+
{
|
| 631 |
+
const dim3 dimBlocks = dim_blocks();
|
| 632 |
+
const unsigned int x = dimBlocks.x * blockDim.x;
|
| 633 |
+
const unsigned int y = dimBlocks.y * blockDim.y;
|
| 634 |
+
const unsigned int z = dimBlocks.z * blockDim.z;
|
| 635 |
+
return dim3(x, y, z);
|
| 636 |
+
}
|
| 637 |
+
|
| 638 |
+
_CG_STATIC_QUALIFIER unsigned int num_threads()
|
| 639 |
+
{
|
| 640 |
+
return num_blocks() * cta::num_threads();
|
| 641 |
+
}
|
| 642 |
+
|
| 643 |
+
};
|
| 644 |
+
#endif
|
| 645 |
+
|
| 646 |
+
_CG_STATIC_QUALIFIER unsigned int laneid()
|
| 647 |
+
{
|
| 648 |
+
unsigned int laneid;
|
| 649 |
+
asm ("mov.u32 %0, %%laneid;" : "=r"(laneid));
|
| 650 |
+
return laneid;
|
| 651 |
+
}
|
| 652 |
+
|
| 653 |
+
_CG_STATIC_QUALIFIER unsigned int lanemask32_eq()
|
| 654 |
+
{
|
| 655 |
+
unsigned int lanemask32_eq;
|
| 656 |
+
asm ("mov.u32 %0, %%lanemask_eq;" : "=r"(lanemask32_eq));
|
| 657 |
+
return (lanemask32_eq);
|
| 658 |
+
}
|
| 659 |
+
|
| 660 |
+
_CG_STATIC_QUALIFIER unsigned int lanemask32_lt()
|
| 661 |
+
{
|
| 662 |
+
unsigned int lanemask32_lt;
|
| 663 |
+
asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask32_lt));
|
| 664 |
+
return (lanemask32_lt);
|
| 665 |
+
}
|
| 666 |
+
|
| 667 |
+
_CG_STATIC_QUALIFIER void abort()
|
| 668 |
+
{
|
| 669 |
+
_CG_ABORT();
|
| 670 |
+
}
|
| 671 |
+
|
| 672 |
+
template <typename Ty>
|
| 673 |
+
_CG_QUALIFIER void assert_if_not_arithmetic() {
|
| 674 |
+
#ifdef _CG_CPP11_FEATURES
|
| 675 |
+
static_assert(
|
| 676 |
+
_CG_STL_NAMESPACE::is_integral<Ty>::value ||
|
| 677 |
+
details::is_float_or_half<Ty>::value,
|
| 678 |
+
"Error: Ty is neither integer or float"
|
| 679 |
+
);
|
| 680 |
+
#endif //_CG_CPP11_FEATURES
|
| 681 |
+
}
|
| 682 |
+
|
| 683 |
+
#ifdef _CG_CPP11_FEATURES
|
| 684 |
+
_CG_STATIC_QUALIFIER constexpr unsigned int log2(unsigned int x) {
|
| 685 |
+
return x == 1 ? 0 : 1 + log2(x / 2);
|
| 686 |
+
}
|
| 687 |
+
#endif //_CG_CPP11_FEATURES
|
| 688 |
+
|
| 689 |
+
}; // !Namespace internal
|
| 690 |
+
|
| 691 |
+
_CG_END_NAMESPACE
|
| 692 |
+
|
| 693 |
+
#endif /* !_COOPERATIVE_GROUPS_HELPERS_H_ */
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/memory.h
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _COOPERATIVE_GROUPS_MEMORY_H_
|
| 50 |
+
# define _COOPERATIVE_GROUPS_MEMORY_H_
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
|
| 54 |
+
_CG_BEGIN_NAMESPACE
|
| 55 |
+
|
| 56 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 57 |
+
namespace details {
|
| 58 |
+
_CG_STATIC_CONST_DECL int scratch_num_reserved_bytes = 12;
|
| 59 |
+
|
| 60 |
+
#if defined(_CG_HAS_RESERVED_SHARED)
|
| 61 |
+
_CG_STATIC_QUALIFIER void* reserved_shared_ptr()
|
| 62 |
+
{
|
| 63 |
+
void *ptr;
|
| 64 |
+
asm ("{\n\t"
|
| 65 |
+
" .reg .u32 start;\n\t"
|
| 66 |
+
" .reg .u64 extended;\n\t"
|
| 67 |
+
" mov.u32 start, %%reserved_smem_offset_1;\n\t"
|
| 68 |
+
" cvt.u64.u32 extended, start;\n\t"
|
| 69 |
+
" cvta.shared.u64 %0, extended;\n\t"
|
| 70 |
+
"}"
|
| 71 |
+
: "=" _CG_ASM_PTR_CONSTRAINT(ptr));
|
| 72 |
+
return ptr;
|
| 73 |
+
}
|
| 74 |
+
#endif
|
| 75 |
+
|
| 76 |
+
struct multi_warp_scratch {
|
| 77 |
+
// One barrier per possible size of the group.
|
| 78 |
+
_CG_STATIC_CONST_DECL unsigned int memory_barriers_count = 5;
|
| 79 |
+
_CG_STATIC_CONST_DECL size_t sync_memory_size = memory_barriers_count * sizeof(barrier_t);
|
| 80 |
+
|
| 81 |
+
using communication_type = unsigned long long;
|
| 82 |
+
_CG_STATIC_CONST_DECL size_t communication_size = sizeof(communication_type);
|
| 83 |
+
|
| 84 |
+
// Layout of the scratch space:
|
| 85 |
+
barrier_t barriers[memory_barriers_count];
|
| 86 |
+
char reserved[scratch_num_reserved_bytes]; // Reserve 12 bytes for future use
|
| 87 |
+
communication_type communication_memory[default_max_block_size / 32];
|
| 88 |
+
|
| 89 |
+
_CG_STATIC_CONSTEXPR_QUALIFIER unsigned int scratch_size_needed(unsigned int max_block_size) {
|
| 90 |
+
// One slot of collectives memory per warp.
|
| 91 |
+
return scratch_num_reserved_bytes + sync_memory_size + max_block_size / 32 * communication_size;
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
_CG_QUALIFIER void init_barriers(unsigned int thread_rank) {
|
| 95 |
+
if (thread_rank < memory_barriers_count) {
|
| 96 |
+
barriers[thread_rank] = 0;
|
| 97 |
+
}
|
| 98 |
+
}
|
| 99 |
+
};
|
| 100 |
+
|
| 101 |
+
#if defined(_CG_HAS_RESERVED_SHARED)
|
| 102 |
+
// CG can expect at least 288 bytes available in reserved shared
|
| 103 |
+
static_assert(sizeof(multi_warp_scratch) <= 288, "multi-warp scratch size is too large");
|
| 104 |
+
#endif
|
| 105 |
+
|
| 106 |
+
// Make sure the structure can fit into the user provided memory
|
| 107 |
+
static_assert(sizeof(multi_warp_scratch) <= multi_warp_scratch::scratch_size_needed(default_max_block_size),
|
| 108 |
+
"multi-warp scratch size is too large");
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
_CG_QUALIFIER multi_warp_scratch* get_scratch_ptr(void* user_scratch) {
|
| 112 |
+
void *ptr;
|
| 113 |
+
#if defined(_CG_HAS_RESERVED_SHARED)
|
| 114 |
+
ptr = reserved_shared_ptr();
|
| 115 |
+
#else
|
| 116 |
+
ptr = user_scratch;
|
| 117 |
+
#endif
|
| 118 |
+
return static_cast<multi_warp_scratch*>(ptr);
|
| 119 |
+
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
template <unsigned int MaxBlockSize = details::default_max_block_size>
|
| 125 |
+
struct __align__(details::multi_warp_scratch::communication_size) block_tile_memory {
|
| 126 |
+
private:
|
| 127 |
+
#if !defined(_CG_HAS_RESERVED_SHARED)
|
| 128 |
+
char scratch[details::multi_warp_scratch::scratch_size_needed(MaxBlockSize)];
|
| 129 |
+
#endif
|
| 130 |
+
};
|
| 131 |
+
#endif
|
| 132 |
+
|
| 133 |
+
_CG_END_NAMESPACE
|
| 134 |
+
|
| 135 |
+
#endif /* !_COOPERATIVE_GROUPS_MEMORY_H_ */
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/reduce.h
ADDED
|
@@ -0,0 +1,419 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_REDUCE_H_
|
| 50 |
+
#define _CG_REDUCE_H_
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "helpers.h"
|
| 54 |
+
#include "coalesced_reduce.h"
|
| 55 |
+
#include "functional.h"
|
| 56 |
+
#include "cooperative_groups.h"
|
| 57 |
+
|
| 58 |
+
_CG_BEGIN_NAMESPACE
|
| 59 |
+
|
| 60 |
+
namespace details {
|
| 61 |
+
|
| 62 |
+
template <class Ty>
|
| 63 |
+
using _redux_is_add_supported = _CG_STL_NAMESPACE::integral_constant<
|
| 64 |
+
bool,
|
| 65 |
+
_CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) <= 4)>;
|
| 66 |
+
|
| 67 |
+
template <class Ty>
|
| 68 |
+
using redux_is_add_supported = _redux_is_add_supported<Ty>;
|
| 69 |
+
|
| 70 |
+
// A specialization for 64 bit logical operations is possible
|
| 71 |
+
// but for now only accelerate 32 bit bitwise ops
|
| 72 |
+
template <class Ty>
|
| 73 |
+
using redux_is_logical_supported = redux_is_add_supported<Ty>;
|
| 74 |
+
|
| 75 |
+
// Base operator support case
|
| 76 |
+
template <class TyOp, class Ty> struct _redux_op_supported : public _CG_STL_NAMESPACE::false_type {};
|
| 77 |
+
#ifdef _CG_HAS_OP_REDUX
|
| 78 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::plus<Ty>, Ty> : public redux_is_add_supported<Ty> {};
|
| 79 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::less<Ty>, Ty> : public redux_is_add_supported<Ty> {};
|
| 80 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::greater<Ty>, Ty> : public redux_is_add_supported<Ty> {};
|
| 81 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::bit_and<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
|
| 82 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::bit_or<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
|
| 83 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::bit_xor<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
|
| 84 |
+
#endif
|
| 85 |
+
|
| 86 |
+
template <class Ty, template <class> class TyOp>
|
| 87 |
+
using redux_op_supported = _redux_op_supported<
|
| 88 |
+
typename details::remove_qual<TyOp<Ty>>,
|
| 89 |
+
Ty>;
|
| 90 |
+
|
| 91 |
+
// Groups smaller than 16 actually have worse performance characteristics when used with redux
|
| 92 |
+
// tiles of size 16 and 32 perform the same or better and have better code generation profiles
|
| 93 |
+
template <class TyGroup> struct _redux_group_optimized : public _CG_STL_NAMESPACE::false_type {};
|
| 94 |
+
|
| 95 |
+
template <unsigned int Sz, typename TyPar>
|
| 96 |
+
struct _redux_group_optimized<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
|
| 97 |
+
bool,
|
| 98 |
+
(Sz >= 16)> {};
|
| 99 |
+
template <unsigned int Sz, typename TyPar>
|
| 100 |
+
struct _redux_group_optimized<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
|
| 101 |
+
bool,
|
| 102 |
+
(Sz >= 16)> {};
|
| 103 |
+
template <>
|
| 104 |
+
struct _redux_group_optimized<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
|
| 105 |
+
|
| 106 |
+
template <typename TyGroup>
|
| 107 |
+
using redux_group_optimized = _redux_group_optimized<details::remove_qual<TyGroup>>;
|
| 108 |
+
|
| 109 |
+
template <template <class> class TyOp>
|
| 110 |
+
_CG_STATIC_QUALIFIER int pick_redux(int mask, int val);
|
| 111 |
+
template <template <class> class TyOp>
|
| 112 |
+
_CG_STATIC_QUALIFIER unsigned int pick_redux(int mask, unsigned int val);
|
| 113 |
+
|
| 114 |
+
#ifdef _CG_HAS_OP_REDUX
|
| 115 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::plus>(int mask, int val) {
|
| 116 |
+
return __reduce_add_sync(mask, val);
|
| 117 |
+
}
|
| 118 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::less>(int mask, int val) {
|
| 119 |
+
return __reduce_min_sync(mask, val);
|
| 120 |
+
}
|
| 121 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::greater>(int mask, int val) {
|
| 122 |
+
return __reduce_max_sync(mask, val);
|
| 123 |
+
}
|
| 124 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_and>(int mask, int val) {
|
| 125 |
+
return __reduce_and_sync(mask, val);
|
| 126 |
+
}
|
| 127 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_xor>(int mask, int val) {
|
| 128 |
+
return __reduce_xor_sync(mask, val);
|
| 129 |
+
}
|
| 130 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_or>(int mask, int val) {
|
| 131 |
+
return __reduce_or_sync(mask, val);
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::plus>(int mask, unsigned int val) {
|
| 135 |
+
return __reduce_add_sync(mask, val);
|
| 136 |
+
}
|
| 137 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::less>(int mask, unsigned int val) {
|
| 138 |
+
return __reduce_min_sync(mask, val);
|
| 139 |
+
}
|
| 140 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::greater>(int mask, unsigned int val) {
|
| 141 |
+
return __reduce_max_sync(mask, val);
|
| 142 |
+
}
|
| 143 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_and>(int mask, unsigned int val) {
|
| 144 |
+
return __reduce_and_sync(mask, val);
|
| 145 |
+
}
|
| 146 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_xor>(int mask, unsigned int val) {
|
| 147 |
+
return __reduce_xor_sync(mask, val);
|
| 148 |
+
}
|
| 149 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_or>(int mask, unsigned int val) {
|
| 150 |
+
return __reduce_or_sync(mask, val);
|
| 151 |
+
}
|
| 152 |
+
#endif
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
template <typename TyVal, bool = _CG_STL_NAMESPACE::is_unsigned<TyVal>::value>
|
| 156 |
+
struct _accelerated_op;
|
| 157 |
+
|
| 158 |
+
// Signed type redux intrinsic dispatch
|
| 159 |
+
template <typename TyVal>
|
| 160 |
+
struct _accelerated_op<TyVal, false> {
|
| 161 |
+
template <template <class> class TyOp>
|
| 162 |
+
_CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
|
| 163 |
+
return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<int>(val)));
|
| 164 |
+
}
|
| 165 |
+
};
|
| 166 |
+
|
| 167 |
+
// Unsigned type redux intrinsic dispatch
|
| 168 |
+
template <typename TyVal>
|
| 169 |
+
struct _accelerated_op<TyVal, true> {
|
| 170 |
+
template <template <class> class TyOp>
|
| 171 |
+
_CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
|
| 172 |
+
return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<unsigned int>(val)));
|
| 173 |
+
}
|
| 174 |
+
};
|
| 175 |
+
|
| 176 |
+
template <typename TyVal>
|
| 177 |
+
using accelerated_op = _accelerated_op<TyVal>;
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
template <typename TyVal, typename TyFnInput, typename TyGroup>
|
| 181 |
+
class _redux_dispatch {
|
| 182 |
+
template <class Ty, template <class> class TyOp>
|
| 183 |
+
using _redux_is_usable = _CG_STL_NAMESPACE::integral_constant<bool,
|
| 184 |
+
redux_op_supported<Ty, TyOp>::value &&
|
| 185 |
+
redux_group_optimized<TyGroup>::value>;
|
| 186 |
+
|
| 187 |
+
template <class Ty, template <class> class TyOp>
|
| 188 |
+
using redux_is_usable = typename _CG_STL_NAMESPACE::enable_if<_redux_is_usable<Ty, TyOp>::value, void>::type*;
|
| 189 |
+
|
| 190 |
+
template <class Ty, template <class> class TyOp>
|
| 191 |
+
using redux_is_not_usable = typename _CG_STL_NAMESPACE::enable_if<!_redux_is_usable<Ty, TyOp>::value, void>::type*;
|
| 192 |
+
|
| 193 |
+
public:
|
| 194 |
+
// Dispatch to redux if the combination of op and args are supported
|
| 195 |
+
template<
|
| 196 |
+
template <class> class TyOp,
|
| 197 |
+
redux_is_usable<TyFnInput, TyOp> = nullptr>
|
| 198 |
+
_CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
|
| 199 |
+
// Retrieve the mask for the group and dispatch to redux
|
| 200 |
+
return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
template<
|
| 204 |
+
template <class> class TyOp,
|
| 205 |
+
redux_is_usable<TyFnInput, TyOp> = nullptr>
|
| 206 |
+
_CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
|
| 207 |
+
// Retrieve the mask for the group and dispatch to redux
|
| 208 |
+
return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
// Fallback shuffle sync reduction
|
| 212 |
+
template <
|
| 213 |
+
template <class> class TyOp,
|
| 214 |
+
redux_is_not_usable<TyFnInput, TyOp> = nullptr>
|
| 215 |
+
_CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
|
| 216 |
+
//Dispatch to fallback shuffle sync accelerated reduction
|
| 217 |
+
return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
};
|
| 221 |
+
|
| 222 |
+
// Group support for reduce.
|
| 223 |
+
template <class TyGroup> struct _reduce_group_supported : public _CG_STL_NAMESPACE::false_type {};
|
| 224 |
+
|
| 225 |
+
template <unsigned int Sz, typename TyPar>
|
| 226 |
+
struct _reduce_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
|
| 227 |
+
template <unsigned int Sz, typename TyPar>
|
| 228 |
+
struct _reduce_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
|
| 229 |
+
template <>
|
| 230 |
+
struct _reduce_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
|
| 231 |
+
|
| 232 |
+
template <typename TyGroup>
|
| 233 |
+
using reduce_group_supported = _reduce_group_supported<details::remove_qual<TyGroup>>;
|
| 234 |
+
|
| 235 |
+
template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
|
| 236 |
+
_CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
|
| 237 |
+
static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
|
| 238 |
+
|
| 239 |
+
using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
|
| 240 |
+
return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
|
| 244 |
+
_CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
|
| 245 |
+
static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
|
| 246 |
+
|
| 247 |
+
using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
|
| 248 |
+
return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
template <typename TyVal, typename TyOp, typename TyGroup>
|
| 253 |
+
_CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
|
| 254 |
+
return details::coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 255 |
+
}
|
| 256 |
+
|
| 257 |
+
template <unsigned int GroupId>
|
| 258 |
+
struct tile_reduce_dispatch;
|
| 259 |
+
|
| 260 |
+
template <>
|
| 261 |
+
struct tile_reduce_dispatch<details::coalesced_group_id> {
|
| 262 |
+
template <typename TyGroup, typename TyVal, typename TyFn>
|
| 263 |
+
_CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 264 |
+
return details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 265 |
+
}
|
| 266 |
+
};
|
| 267 |
+
|
| 268 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 269 |
+
template <>
|
| 270 |
+
struct tile_reduce_dispatch<details::multi_tile_group_id> {
|
| 271 |
+
template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
|
| 272 |
+
_CG_STATIC_QUALIFIER auto reduce(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 273 |
+
using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
|
| 274 |
+
using TyRet = details::remove_qual<TyVal>;
|
| 275 |
+
const unsigned int num_warps = Size / 32;
|
| 276 |
+
|
| 277 |
+
auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
|
| 278 |
+
*warp_scratch_location =
|
| 279 |
+
details::reduce(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
|
| 280 |
+
};
|
| 281 |
+
auto inter_warp_lambda =
|
| 282 |
+
[&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
|
| 283 |
+
*thread_scratch_location =
|
| 284 |
+
details::reduce(subwarp, *thread_scratch_location, _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 285 |
+
};
|
| 286 |
+
return details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
|
| 287 |
+
}
|
| 288 |
+
};
|
| 289 |
+
|
| 290 |
+
template <unsigned int GroupId>
|
| 291 |
+
struct tile_async_reduce_dispatch;
|
| 292 |
+
|
| 293 |
+
template <>
|
| 294 |
+
struct tile_async_reduce_dispatch<details::coalesced_group_id> {
|
| 295 |
+
template <typename GroupT, typename TyDst, typename TyVal, typename TyFn, typename TyResHandler>
|
| 296 |
+
_CG_STATIC_QUALIFIER void reduce(const GroupT& group, TyDst& dst, TyVal&& val, TyFn&& op, TyResHandler& res_handler) {
|
| 297 |
+
// Do regular, in group reduction
|
| 298 |
+
auto result = details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
|
| 299 |
+
|
| 300 |
+
// One thread stores/updates the destination
|
| 301 |
+
if (group.thread_rank() == 0) {
|
| 302 |
+
res_handler(result);
|
| 303 |
+
}
|
| 304 |
+
}
|
| 305 |
+
};
|
| 306 |
+
|
| 307 |
+
template <>
|
| 308 |
+
struct tile_async_reduce_dispatch<details::multi_tile_group_id> {
|
| 309 |
+
template <unsigned int TySize, typename ParentT, typename TyDst, typename TyInputVal, typename TyFn, typename TyResHandler>
|
| 310 |
+
_CG_STATIC_QUALIFIER void reduce(const thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyInputVal&& val, TyFn&& op, TyResHandler& res_handler) {
|
| 311 |
+
using TyVal = remove_qual<TyInputVal>;
|
| 312 |
+
const unsigned int num_warps = TySize / 32;
|
| 313 |
+
details::barrier_t* sync_location = multi_warp_sync_location_getter(group);
|
| 314 |
+
auto warp_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, group.thread_rank() / 32);
|
| 315 |
+
|
| 316 |
+
// Do in warp reduce
|
| 317 |
+
auto warp = details::tiled_partition_internal<32, thread_block_tile<TySize, ParentT>>();
|
| 318 |
+
*warp_scratch_location = details::reduce(warp, _CG_STL_NAMESPACE::forward<TyInputVal>(val), op);
|
| 319 |
+
|
| 320 |
+
// Tile of size num_warps from the last warp to arrive does final reduction step
|
| 321 |
+
if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), num_warps)) {
|
| 322 |
+
auto subwarp = details::tiled_partition_internal<num_warps, decltype(warp)>();
|
| 323 |
+
if (subwarp.meta_group_rank() == 0) {
|
| 324 |
+
auto thread_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, subwarp.thread_rank());
|
| 325 |
+
auto thread_val = *thread_scratch_location;
|
| 326 |
+
// Release other warps, we read their contribution already.
|
| 327 |
+
subwarp.sync();
|
| 328 |
+
details::sync_warps_release(sync_location, subwarp.thread_rank() == 0, details::cta::thread_rank(), num_warps);
|
| 329 |
+
TyVal result = details::reduce(subwarp, thread_val, op);
|
| 330 |
+
// One thread stores the result or updates the atomic
|
| 331 |
+
if (subwarp.thread_rank() == 0) {
|
| 332 |
+
res_handler(result);
|
| 333 |
+
}
|
| 334 |
+
}
|
| 335 |
+
warp.sync();
|
| 336 |
+
}
|
| 337 |
+
}
|
| 338 |
+
};
|
| 339 |
+
#endif
|
| 340 |
+
|
| 341 |
+
template <typename TyGroup, typename TyInputVal, typename TyRetVal>
|
| 342 |
+
_CG_QUALIFIER void check_reduce_params() {
|
| 343 |
+
static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
|
| 344 |
+
static_assert(details::reduce_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
|
| 345 |
+
};
|
| 346 |
+
|
| 347 |
+
template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
|
| 348 |
+
_CG_QUALIFIER void check_async_reduce_params() {
|
| 349 |
+
check_reduce_params<TyGroup, TyInputVal, TyRetVal>();
|
| 350 |
+
static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
|
| 351 |
+
}
|
| 352 |
+
} // details
|
| 353 |
+
|
| 354 |
+
template <typename TyGroup, typename TyVal, typename TyFn>
|
| 355 |
+
_CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 356 |
+
details::check_reduce_params<TyGroup, details::remove_qual<TyVal>, decltype(op(val, val))>();
|
| 357 |
+
|
| 358 |
+
using dispatch = details::tile_reduce_dispatch<TyGroup::_group_id>;
|
| 359 |
+
return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 360 |
+
}
|
| 361 |
+
|
| 362 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 363 |
+
|
| 364 |
+
# if defined(_CG_HAS_STL_ATOMICS)
|
| 365 |
+
template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
|
| 366 |
+
void _CG_QUALIFIER reduce_update_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
|
| 367 |
+
details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 368 |
+
auto update_lambda = [&] (TyVal& result) {
|
| 369 |
+
details::atomic_update(dst, result, op);
|
| 370 |
+
};
|
| 371 |
+
using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
|
| 372 |
+
dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), update_lambda);
|
| 373 |
+
}
|
| 374 |
+
|
| 375 |
+
template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
|
| 376 |
+
void _CG_QUALIFIER reduce_update_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
|
| 377 |
+
details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 378 |
+
auto update_lambda = [&] (TyVal& result) {
|
| 379 |
+
details::atomic_update(dst, result, op);
|
| 380 |
+
};
|
| 381 |
+
using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
|
| 382 |
+
dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), update_lambda);
|
| 383 |
+
}
|
| 384 |
+
|
| 385 |
+
template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
|
| 386 |
+
void _CG_QUALIFIER reduce_store_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
|
| 387 |
+
details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 388 |
+
auto store_lambda = [&] (TyVal& result) {
|
| 389 |
+
details::atomic_store(dst, result);
|
| 390 |
+
};
|
| 391 |
+
using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
|
| 392 |
+
dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
|
| 393 |
+
}
|
| 394 |
+
|
| 395 |
+
template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
|
| 396 |
+
void _CG_QUALIFIER reduce_store_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
|
| 397 |
+
details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 398 |
+
auto store_lambda = [&] (TyVal& result) {
|
| 399 |
+
details::atomic_store(dst, result);
|
| 400 |
+
};
|
| 401 |
+
using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
|
| 402 |
+
dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
|
| 403 |
+
}
|
| 404 |
+
# endif
|
| 405 |
+
|
| 406 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, typename TyFn>
|
| 407 |
+
void _CG_QUALIFIER reduce_store_async(const TyGroup& group, TyVal* dst, TyInputVal&& val, TyFn&& op) {
|
| 408 |
+
details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 409 |
+
auto store_lambda = [&] (TyVal& result) {
|
| 410 |
+
*dst = result;
|
| 411 |
+
};
|
| 412 |
+
using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
|
| 413 |
+
dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
|
| 414 |
+
}
|
| 415 |
+
#endif
|
| 416 |
+
|
| 417 |
+
_CG_END_NAMESPACE
|
| 418 |
+
|
| 419 |
+
#endif // _CG_REDUCE_H_
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/scan.h
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_SCAN_H_
|
| 50 |
+
#define _CG_SCAN_H_
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "helpers.h"
|
| 54 |
+
#include "functional.h"
|
| 55 |
+
#include "coalesced_scan.h"
|
| 56 |
+
|
| 57 |
+
_CG_BEGIN_NAMESPACE
|
| 58 |
+
|
| 59 |
+
namespace details {
|
| 60 |
+
|
| 61 |
+
// Group support for scan.
|
| 62 |
+
template <class TyGroup> struct _scan_group_supported : public _CG_STL_NAMESPACE::false_type {};
|
| 63 |
+
|
| 64 |
+
template <unsigned int Sz, typename TyPar>
|
| 65 |
+
struct _scan_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
|
| 66 |
+
template <unsigned int Sz, typename TyPar>
|
| 67 |
+
struct _scan_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
|
| 68 |
+
template <>
|
| 69 |
+
struct _scan_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
|
| 70 |
+
|
| 71 |
+
template <typename TyGroup>
|
| 72 |
+
using scan_group_supported = _scan_group_supported<details::remove_qual<TyGroup>>;
|
| 73 |
+
|
| 74 |
+
template <bool IsIntegralPlus>
|
| 75 |
+
struct integral_optimized_scan;
|
| 76 |
+
|
| 77 |
+
enum class ScanType { exclusive, inclusive };
|
| 78 |
+
|
| 79 |
+
template <unsigned int GroupId, ScanType TyScan>
|
| 80 |
+
struct scan_dispatch;
|
| 81 |
+
|
| 82 |
+
template <ScanType TyScan>
|
| 83 |
+
struct scan_dispatch<details::coalesced_group_id, TyScan> {
|
| 84 |
+
template <typename TyGroup, typename TyVal, typename TyFn>
|
| 85 |
+
_CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 86 |
+
auto scan_result = coalesced_inclusive_scan(group, val, op);
|
| 87 |
+
if (TyScan == ScanType::exclusive) {
|
| 88 |
+
scan_result = convert_inclusive_to_exclusive(group,
|
| 89 |
+
scan_result,
|
| 90 |
+
_CG_STL_NAMESPACE::forward<TyVal>(val),
|
| 91 |
+
_CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 92 |
+
}
|
| 93 |
+
return scan_result;
|
| 94 |
+
}
|
| 95 |
+
};
|
| 96 |
+
|
| 97 |
+
#if defined(_CG_CPP11_FEATURES)
|
| 98 |
+
template <ScanType TyScan>
|
| 99 |
+
struct scan_dispatch<details::multi_tile_group_id, TyScan> {
|
| 100 |
+
template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
|
| 101 |
+
_CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 102 |
+
using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
|
| 103 |
+
using TyRet = details::remove_qual<TyVal>;
|
| 104 |
+
const unsigned int num_warps = Size / 32;
|
| 105 |
+
// In warp scan result, calculated in warp_lambda
|
| 106 |
+
TyRet warp_scan;
|
| 107 |
+
|
| 108 |
+
// In warp scan, put sum in the warp_scratch_location
|
| 109 |
+
auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
|
| 110 |
+
warp_scan =
|
| 111 |
+
details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
|
| 112 |
+
if (warp.thread_rank() + 1 == warp.size()) {
|
| 113 |
+
*warp_scratch_location = warp_scan;
|
| 114 |
+
}
|
| 115 |
+
if (TyScan == ScanType::exclusive) {
|
| 116 |
+
warp_scan = warp.shfl_up(warp_scan, 1);
|
| 117 |
+
}
|
| 118 |
+
};
|
| 119 |
+
|
| 120 |
+
// Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
|
| 121 |
+
// to its in-warp scan result
|
| 122 |
+
auto inter_warp_lambda =
|
| 123 |
+
[&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
|
| 124 |
+
auto thread_val = *thread_scratch_location;
|
| 125 |
+
auto result = coalesced_inclusive_scan(subwarp, thread_val, op);
|
| 126 |
+
*thread_scratch_location = convert_inclusive_to_exclusive(subwarp, result, thread_val, op);
|
| 127 |
+
};
|
| 128 |
+
|
| 129 |
+
TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
|
| 130 |
+
if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
|
| 131 |
+
return previous_warps_sum;
|
| 132 |
+
}
|
| 133 |
+
if (warpType::meta_group_rank() == 0) {
|
| 134 |
+
return warp_scan;
|
| 135 |
+
}
|
| 136 |
+
else {
|
| 137 |
+
return op(warp_scan, previous_warps_sum);
|
| 138 |
+
}
|
| 139 |
+
}
|
| 140 |
+
};
|
| 141 |
+
|
| 142 |
+
#if defined(_CG_HAS_STL_ATOMICS)
|
| 143 |
+
template <unsigned int GroupId, ScanType TyScan>
|
| 144 |
+
struct scan_update_dispatch;
|
| 145 |
+
|
| 146 |
+
template <ScanType TyScan>
|
| 147 |
+
struct scan_update_dispatch<details::coalesced_group_id, TyScan> {
|
| 148 |
+
template <typename TyGroup, typename TyAtomic, typename TyVal, typename TyFn>
|
| 149 |
+
_CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 150 |
+
details::remove_qual<TyVal> old;
|
| 151 |
+
|
| 152 |
+
// Do regular in group scan
|
| 153 |
+
auto scan_result = details::coalesced_inclusive_scan(group, val, op);
|
| 154 |
+
|
| 155 |
+
// Last thread updates the atomic and distributes its old value to other threads
|
| 156 |
+
if (group.thread_rank() == group.size() - 1) {
|
| 157 |
+
old = atomic_update(dst, scan_result, _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 158 |
+
}
|
| 159 |
+
old = group.shfl(old, group.size() - 1);
|
| 160 |
+
if (TyScan == ScanType::exclusive) {
|
| 161 |
+
scan_result = convert_inclusive_to_exclusive(group, scan_result, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
|
| 162 |
+
}
|
| 163 |
+
scan_result = op(old, scan_result);
|
| 164 |
+
return scan_result;
|
| 165 |
+
}
|
| 166 |
+
};
|
| 167 |
+
|
| 168 |
+
template <ScanType TyScan>
|
| 169 |
+
struct scan_update_dispatch<details::multi_tile_group_id, TyScan> {
|
| 170 |
+
template <unsigned int Size, typename ParentT, typename TyAtomic, typename TyVal, typename TyFn>
|
| 171 |
+
_CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 172 |
+
using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
|
| 173 |
+
using TyRet = details::remove_qual<TyVal>;
|
| 174 |
+
const unsigned int num_warps = Size / 32;
|
| 175 |
+
// In warp scan result, calculated in warp_lambda
|
| 176 |
+
TyRet warp_scan;
|
| 177 |
+
|
| 178 |
+
// In warp scan, put sum in the warp_scratch_location
|
| 179 |
+
auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
|
| 180 |
+
warp_scan =
|
| 181 |
+
details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
|
| 182 |
+
if (warp.thread_rank() + 1 == warp.size()) {
|
| 183 |
+
*warp_scratch_location = warp_scan;
|
| 184 |
+
}
|
| 185 |
+
if (TyScan == ScanType::exclusive) {
|
| 186 |
+
warp_scan = warp.shfl_up(warp_scan, 1);
|
| 187 |
+
}
|
| 188 |
+
};
|
| 189 |
+
|
| 190 |
+
// Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
|
| 191 |
+
// to its in-warp scan result
|
| 192 |
+
auto inter_warp_lambda =
|
| 193 |
+
[&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
|
| 194 |
+
auto thread_val = *thread_scratch_location;
|
| 195 |
+
auto scan_result = details::coalesced_inclusive_scan(subwarp, thread_val, op);
|
| 196 |
+
TyRet offset;
|
| 197 |
+
// Single thread does the atomic update with sum of all contributions and reads the old value.
|
| 198 |
+
if (subwarp.thread_rank() == subwarp.size() - 1) {
|
| 199 |
+
offset = details::atomic_update(dst, scan_result, op);
|
| 200 |
+
}
|
| 201 |
+
offset = subwarp.shfl(offset, subwarp.size() - 1);
|
| 202 |
+
scan_result = convert_inclusive_to_exclusive(subwarp, scan_result, thread_val, op);
|
| 203 |
+
// Add offset read from the atomic to the scanned warp sum.
|
| 204 |
+
// Skipping first thread, since it got defautly constructed value from the conversion,
|
| 205 |
+
// it should just return the offset received from the thread that did the atomic update.
|
| 206 |
+
if (subwarp.thread_rank() != 0) {
|
| 207 |
+
offset = op(scan_result, offset);
|
| 208 |
+
}
|
| 209 |
+
*thread_scratch_location = offset;
|
| 210 |
+
};
|
| 211 |
+
|
| 212 |
+
TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
|
| 213 |
+
if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
|
| 214 |
+
return previous_warps_sum;
|
| 215 |
+
}
|
| 216 |
+
return op(warp_scan, previous_warps_sum);
|
| 217 |
+
}
|
| 218 |
+
};
|
| 219 |
+
#endif
|
| 220 |
+
#endif
|
| 221 |
+
|
| 222 |
+
template <typename TyGroup, typename TyInputVal, typename TyRetVal>
|
| 223 |
+
_CG_QUALIFIER void check_scan_params() {
|
| 224 |
+
static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
|
| 225 |
+
static_assert(details::scan_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
#if defined(_CG_HAS_STL_ATOMICS)
|
| 229 |
+
template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
|
| 230 |
+
_CG_QUALIFIER void check_scan_update_params() {
|
| 231 |
+
check_scan_params<TyGroup, TyInputVal, TyRetVal>();
|
| 232 |
+
static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
|
| 233 |
+
}
|
| 234 |
+
#endif
|
| 235 |
+
|
| 236 |
+
} // details
|
| 237 |
+
|
| 238 |
+
template <typename TyGroup, typename TyVal, typename TyFn>
|
| 239 |
+
_CG_QUALIFIER auto inclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 240 |
+
details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
|
| 241 |
+
|
| 242 |
+
using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
|
| 243 |
+
return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
+
template <typename TyGroup, typename TyVal>
|
| 247 |
+
_CG_QUALIFIER details::remove_qual<TyVal> inclusive_scan(const TyGroup& group, TyVal&& val) {
|
| 248 |
+
return inclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
template <typename TyGroup, typename TyVal, typename TyFn>
|
| 252 |
+
_CG_QUALIFIER auto exclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 253 |
+
details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
|
| 254 |
+
|
| 255 |
+
using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
|
| 256 |
+
return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
template <typename TyGroup, typename TyVal>
|
| 260 |
+
_CG_QUALIFIER details::remove_qual<TyVal> exclusive_scan(const TyGroup& group, TyVal&& val) {
|
| 261 |
+
return exclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
#if defined(_CG_HAS_STL_ATOMICS)
|
| 265 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
|
| 266 |
+
_CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 267 |
+
details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 268 |
+
|
| 269 |
+
using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
|
| 270 |
+
return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 271 |
+
}
|
| 272 |
+
|
| 273 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
|
| 274 |
+
_CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco> & dst, TyInputVal&& val) {
|
| 275 |
+
return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
|
| 276 |
+
}
|
| 277 |
+
|
| 278 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
|
| 279 |
+
_CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 280 |
+
details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 281 |
+
|
| 282 |
+
using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
|
| 283 |
+
return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
|
| 287 |
+
_CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val) {
|
| 288 |
+
return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
|
| 289 |
+
}
|
| 290 |
+
|
| 291 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
|
| 292 |
+
_CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 293 |
+
details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 294 |
+
|
| 295 |
+
using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
|
| 296 |
+
return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 297 |
+
}
|
| 298 |
+
|
| 299 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
|
| 300 |
+
_CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco> & dst, TyInputVal&& val) {
|
| 301 |
+
return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
|
| 302 |
+
}
|
| 303 |
+
|
| 304 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
|
| 305 |
+
_CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 306 |
+
details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 307 |
+
|
| 308 |
+
using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
|
| 309 |
+
return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 310 |
+
}
|
| 311 |
+
|
| 312 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
|
| 313 |
+
_CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val) {
|
| 314 |
+
return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
|
| 315 |
+
}
|
| 316 |
+
#endif
|
| 317 |
+
|
| 318 |
+
_CG_END_NAMESPACE
|
| 319 |
+
|
| 320 |
+
#endif // _CG_SCAN_H_
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/memcpy_async.h
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _COOPERATIVE_GROUPS_MEMCPY_ASYNC
|
| 50 |
+
#define _COOPERATIVE_GROUPS_MEMCPY_ASYNC
|
| 51 |
+
|
| 52 |
+
#include "../cooperative_groups.h"
|
| 53 |
+
#include "details/info.h"
|
| 54 |
+
|
| 55 |
+
#ifdef _CG_CPP11_FEATURES
|
| 56 |
+
# include "details/async.h"
|
| 57 |
+
#else
|
| 58 |
+
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
|
| 59 |
+
-std=c++11 compiler option.
|
| 60 |
+
#endif
|
| 61 |
+
|
| 62 |
+
#endif // _COOPERATIVE_GROUPS_MEMCPY_ASYNC
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/reduce.h
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _COOPERATIVE_GROUPS_REDUCE_H
|
| 50 |
+
#define _COOPERATIVE_GROUPS_REDUCE_H
|
| 51 |
+
|
| 52 |
+
#include "../cooperative_groups.h"
|
| 53 |
+
#include "details/info.h"
|
| 54 |
+
|
| 55 |
+
#ifdef _CG_CPP11_FEATURES
|
| 56 |
+
# include "details/reduce.h"
|
| 57 |
+
#else
|
| 58 |
+
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
|
| 59 |
+
-std=c++11 compiler option.
|
| 60 |
+
#endif
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
#endif //_COOPERATIVE_GROUPS_REDUCE_H
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/scan.h
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _COOPERATIVE_GROUPS_SCAN_H
|
| 50 |
+
#define _COOPERATIVE_GROUPS_SCAN_H
|
| 51 |
+
|
| 52 |
+
#include "../cooperative_groups.h"
|
| 53 |
+
#include "details/info.h"
|
| 54 |
+
|
| 55 |
+
#ifdef _CG_CPP11_FEATURES
|
| 56 |
+
# include "details/scan.h"
|
| 57 |
+
#else
|
| 58 |
+
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
|
| 59 |
+
-std=c++11 compiler option.
|
| 60 |
+
#endif
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
#endif //_COOPERATIVE_GROUPS_SCAN_H
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/common_functions.h
ADDED
|
@@ -0,0 +1,310 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("crt/common_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "crt/common_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#if !defined(__COMMON_FUNCTIONS_H__)
|
| 61 |
+
#define __COMMON_FUNCTIONS_H__
|
| 62 |
+
|
| 63 |
+
/*******************************************************************************
|
| 64 |
+
* *
|
| 65 |
+
* *
|
| 66 |
+
* *
|
| 67 |
+
*******************************************************************************/
|
| 68 |
+
|
| 69 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 70 |
+
|
| 71 |
+
#include "builtin_types.h"
|
| 72 |
+
#include "host_defines.h"
|
| 73 |
+
|
| 74 |
+
#define __CUDACC_VER__ "__CUDACC_VER__ is no longer supported. Use __CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__, and __CUDACC_VER_BUILD__ instead."
|
| 75 |
+
|
| 76 |
+
#ifndef __CUDA_API_VER_MAJOR__
|
| 77 |
+
#define __CUDA_API_VER_MAJOR__ __CUDACC_VER_MAJOR__
|
| 78 |
+
#endif /* __CUDA_API_VER_MAJOR__ */
|
| 79 |
+
|
| 80 |
+
#ifndef __CUDA_API_VER_MINOR__
|
| 81 |
+
#define __CUDA_API_VER_MINOR__ __CUDACC_VER_MINOR__
|
| 82 |
+
#endif /* __CUDA_API_VER_MINOR__ */
|
| 83 |
+
|
| 84 |
+
#if !defined(__CUDACC_RTC__)
|
| 85 |
+
#include <string.h>
|
| 86 |
+
#include <time.h>
|
| 87 |
+
|
| 88 |
+
extern "C"
|
| 89 |
+
{
|
| 90 |
+
#endif /* !__CUDACC_RTC__ */
|
| 91 |
+
extern _CRTIMP __host__ __device__ __device_builtin__ __cudart_builtin__ clock_t __cdecl clock(void)
|
| 92 |
+
#if defined(__QNX__)
|
| 93 |
+
asm("clock32")
|
| 94 |
+
#endif
|
| 95 |
+
__THROW;
|
| 96 |
+
extern __host__ __device__ __device_builtin__ __cudart_builtin__ void* __cdecl memset(void*, int, size_t) __THROW;
|
| 97 |
+
extern __host__ __device__ __device_builtin__ __cudart_builtin__ void* __cdecl memcpy(void*, const void*, size_t) __THROW;
|
| 98 |
+
#if !defined(__CUDACC_RTC__)
|
| 99 |
+
}
|
| 100 |
+
#endif /* !__CUDACC_RTC__ */
|
| 101 |
+
|
| 102 |
+
#if defined(__CUDA_ARCH__)
|
| 103 |
+
|
| 104 |
+
#if defined(__CUDACC_RTC__)
|
| 105 |
+
inline __host__ __device__ void* operator new(size_t, void *p) { return p; }
|
| 106 |
+
inline __host__ __device__ void* operator new[](size_t, void *p) { return p; }
|
| 107 |
+
inline __host__ __device__ void operator delete(void*, void*) { }
|
| 108 |
+
inline __host__ __device__ void operator delete[](void*, void*) { }
|
| 109 |
+
#else /* !__CUDACC_RTC__ */
|
| 110 |
+
#ifndef __CUDA_INTERNAL_SKIP_CPP_HEADERS__
|
| 111 |
+
#include <new>
|
| 112 |
+
#endif
|
| 113 |
+
|
| 114 |
+
#if defined (__GNUC__)
|
| 115 |
+
|
| 116 |
+
#define STD \
|
| 117 |
+
std::
|
| 118 |
+
|
| 119 |
+
#else /* __GNUC__ */
|
| 120 |
+
|
| 121 |
+
#define STD
|
| 122 |
+
|
| 123 |
+
#endif /* __GNUC__ */
|
| 124 |
+
|
| 125 |
+
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new(STD size_t, void*) throw();
|
| 126 |
+
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new[](STD size_t, void*) throw();
|
| 127 |
+
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, void*) throw();
|
| 128 |
+
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, void*) throw();
|
| 129 |
+
# if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__)
|
| 130 |
+
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, STD size_t) throw();
|
| 131 |
+
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, STD size_t) throw();
|
| 132 |
+
#endif /* __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__) */
|
| 133 |
+
#endif /* __CUDACC_RTC__ */
|
| 134 |
+
|
| 135 |
+
#if !defined(__CUDACC_RTC__)
|
| 136 |
+
#include <stdio.h>
|
| 137 |
+
#include <stdlib.h>
|
| 138 |
+
#endif /* !__CUDACC_RTC__ */
|
| 139 |
+
|
| 140 |
+
#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
|
| 141 |
+
namespace std {
|
| 142 |
+
#endif
|
| 143 |
+
extern "C"
|
| 144 |
+
{
|
| 145 |
+
extern
|
| 146 |
+
#if !defined(_MSC_VER) || _MSC_VER < 1900
|
| 147 |
+
_CRTIMP
|
| 148 |
+
#endif
|
| 149 |
+
|
| 150 |
+
#if defined(__GLIBC__) && defined(__GLIBC_MINOR__) && ( (__GLIBC__ < 2) || ( (__GLIBC__ == 2) && (__GLIBC_MINOR__ < 3) ) )
|
| 151 |
+
__host__ __device__ __device_builtin__ __cudart_builtin__ int __cdecl printf(const char*, ...) __THROW;
|
| 152 |
+
#else /* newer glibc */
|
| 153 |
+
__host__ __device__ __device_builtin__ __cudart_builtin__ int __cdecl printf(const char*, ...);
|
| 154 |
+
#endif /* defined(__GLIBC__) && defined(__GLIBC_MINOR__) && ( (__GLIBC__ < 2) || ( (__GLIBC__ == 2) && (__GLIBC_MINOR__ < 3) ) ) */
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
extern _CRTIMP __host__ __device__ __cudart_builtin__ void* __cdecl malloc(size_t) __THROW;
|
| 158 |
+
extern _CRTIMP __host__ __device__ __cudart_builtin__ void __cdecl free(void*) __THROW;
|
| 159 |
+
|
| 160 |
+
#if defined(_MSC_VER)
|
| 161 |
+
extern __host__ __device__ __cudart_builtin__ void* __cdecl _alloca(size_t);
|
| 162 |
+
#endif
|
| 163 |
+
|
| 164 |
+
#if defined(__QNX__)
|
| 165 |
+
#undef alloca
|
| 166 |
+
#define alloca(__S) __builtin_alloca(__S)
|
| 167 |
+
#endif
|
| 168 |
+
}
|
| 169 |
+
#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
|
| 170 |
+
} /* std */
|
| 171 |
+
#endif
|
| 172 |
+
|
| 173 |
+
#if !defined(__CUDACC_RTC__)
|
| 174 |
+
#include <assert.h>
|
| 175 |
+
#endif /* !__CUDACC_RTC__ */
|
| 176 |
+
|
| 177 |
+
extern "C"
|
| 178 |
+
{
|
| 179 |
+
#if defined(__CUDACC_RTC__)
|
| 180 |
+
extern __host__ __device__ void __assertfail(const char * __assertion,
|
| 181 |
+
const char *__file,
|
| 182 |
+
unsigned int __line,
|
| 183 |
+
const char *__function,
|
| 184 |
+
size_t charsize);
|
| 185 |
+
#elif defined(__APPLE__)
|
| 186 |
+
#define __builtin_expect(exp,c) (exp)
|
| 187 |
+
extern __host__ __device__ __cudart_builtin__ void __assert_rtn(
|
| 188 |
+
const char *, const char *, int, const char *);
|
| 189 |
+
#elif defined(__ANDROID__)
|
| 190 |
+
extern __host__ __device__ __cudart_builtin__ void __assert2(
|
| 191 |
+
const char *, int, const char *, const char *);
|
| 192 |
+
#elif defined(__QNX__)
|
| 193 |
+
#if !defined(_LIBCPP_VERSION)
|
| 194 |
+
namespace std {
|
| 195 |
+
#endif
|
| 196 |
+
extern __host__ __device__ __cudart_builtin__ void __assert(
|
| 197 |
+
const char *, const char *, unsigned int, const char *);
|
| 198 |
+
#if !defined(_LIBCPP_VERSION)
|
| 199 |
+
}
|
| 200 |
+
#endif
|
| 201 |
+
#elif defined(__HORIZON__)
|
| 202 |
+
extern __host__ __device__ __cudart_builtin__ void __assert_fail(
|
| 203 |
+
const char *, const char *, int, const char *);
|
| 204 |
+
#elif defined(__GNUC__)
|
| 205 |
+
extern __host__ __device__ __cudart_builtin__ void __assert_fail(
|
| 206 |
+
const char *, const char *, unsigned int, const char *)
|
| 207 |
+
__THROW;
|
| 208 |
+
#elif defined(_WIN32)
|
| 209 |
+
extern __host__ __device__ __cudart_builtin__ _CRTIMP void __cdecl _wassert(
|
| 210 |
+
const wchar_t *, const wchar_t *, unsigned);
|
| 211 |
+
#endif
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
#if defined(__CUDACC_RTC__)
|
| 215 |
+
#ifdef NDEBUG
|
| 216 |
+
#define assert(e) (static_cast<void>(0))
|
| 217 |
+
#else /* !NDEBUG */
|
| 218 |
+
#define __ASSERT_STR_HELPER(x) #x
|
| 219 |
+
#define assert(e) ((e) ? static_cast<void>(0)\
|
| 220 |
+
: __assertfail(__ASSERT_STR_HELPER(e), __FILE__,\
|
| 221 |
+
__LINE__, __PRETTY_FUNCTION__,\
|
| 222 |
+
sizeof(char)))
|
| 223 |
+
#endif /* NDEBUG */
|
| 224 |
+
__host__ __device__ void* operator new(size_t);
|
| 225 |
+
__host__ __device__ void* operator new[](size_t);
|
| 226 |
+
__host__ __device__ void operator delete(void*);
|
| 227 |
+
__host__ __device__ void operator delete[](void*);
|
| 228 |
+
# if __cplusplus >= 201402L
|
| 229 |
+
__host__ __device__ void operator delete(void*, size_t);
|
| 230 |
+
__host__ __device__ void operator delete[](void*, size_t);
|
| 231 |
+
#endif /* __cplusplus >= 201402L */
|
| 232 |
+
|
| 233 |
+
#if __cplusplus >= 201703L
|
| 234 |
+
namespace std { enum class align_val_t : size_t {}; }
|
| 235 |
+
__host__ __device__ void* __cdecl operator new(size_t sz, std::align_val_t) noexcept;
|
| 236 |
+
__host__ __device__ void* __cdecl operator new[](size_t sz, std::align_val_t) noexcept;
|
| 237 |
+
__host__ __device__ void __cdecl operator delete(void* ptr, std::align_val_t) noexcept;
|
| 238 |
+
__host__ __device__ void __cdecl operator delete[](void* ptr, std::align_val_t) noexcept;
|
| 239 |
+
__host__ __device__ void __cdecl operator delete(void* ptr, size_t, std::align_val_t) noexcept;
|
| 240 |
+
__host__ __device__ void __cdecl operator delete[](void* ptr, size_t, std::align_val_t) noexcept;
|
| 241 |
+
#endif /* __cplusplus >= 201703L */
|
| 242 |
+
|
| 243 |
+
#else /* !__CUDACC_RTC__ */
|
| 244 |
+
#if defined (__GNUC__)
|
| 245 |
+
|
| 246 |
+
#define __NV_GLIBCXX_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
|
| 247 |
+
|
| 248 |
+
#if (__cplusplus >= 201103L) && ((!(defined(__QNX__) && defined(_LIBCPP_VERSION))) || (defined(__QNX__) && __NV_GLIBCXX_VERSION >= 80300))
|
| 249 |
+
#define THROWBADALLOC
|
| 250 |
+
#else
|
| 251 |
+
#if defined(__ANDROID__) && !defined(_LIBCPP_VERSION) && (defined(__BIONIC__) || __NV_GLIBCXX_VERSION < 40900)
|
| 252 |
+
#define THROWBADALLOC
|
| 253 |
+
#else
|
| 254 |
+
#define THROWBADALLOC throw(STD bad_alloc)
|
| 255 |
+
#endif
|
| 256 |
+
#endif
|
| 257 |
+
#define __DELETE_THROW throw()
|
| 258 |
+
|
| 259 |
+
#undef __NV_GLIBCXX_VERSION
|
| 260 |
+
|
| 261 |
+
#else /* __GNUC__ */
|
| 262 |
+
|
| 263 |
+
#define THROWBADALLOC throw(...)
|
| 264 |
+
|
| 265 |
+
#endif /* __GNUC__ */
|
| 266 |
+
|
| 267 |
+
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new(STD size_t) THROWBADALLOC;
|
| 268 |
+
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new[](STD size_t) THROWBADALLOC;
|
| 269 |
+
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*) throw();
|
| 270 |
+
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*) throw();
|
| 271 |
+
# if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__)
|
| 272 |
+
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, STD size_t) throw();
|
| 273 |
+
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, STD size_t) throw();
|
| 274 |
+
#endif /* __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__) */
|
| 275 |
+
|
| 276 |
+
#if __cpp_aligned_new
|
| 277 |
+
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new(STD size_t, std::align_val_t);
|
| 278 |
+
extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new[](STD size_t, std::align_val_t);
|
| 279 |
+
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, std::align_val_t) noexcept;
|
| 280 |
+
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, std::align_val_t) noexcept;
|
| 281 |
+
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, STD size_t, std::align_val_t) noexcept;
|
| 282 |
+
extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, STD size_t, std::align_val_t) noexcept;
|
| 283 |
+
#endif /* __cpp_aligned_new */
|
| 284 |
+
|
| 285 |
+
#undef THROWBADALLOC
|
| 286 |
+
#undef STD
|
| 287 |
+
#endif /* __CUDACC_RTC__ */
|
| 288 |
+
|
| 289 |
+
#endif /* __CUDA_ARCH__ */
|
| 290 |
+
|
| 291 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 292 |
+
|
| 293 |
+
/*******************************************************************************
|
| 294 |
+
* *
|
| 295 |
+
* *
|
| 296 |
+
* *
|
| 297 |
+
*******************************************************************************/
|
| 298 |
+
|
| 299 |
+
#if defined(__CUDACC_RTC__) && (__CUDA_ARCH__ >= 350)
|
| 300 |
+
#include "cuda_device_runtime_api.h"
|
| 301 |
+
#endif
|
| 302 |
+
|
| 303 |
+
#include "math_functions.h"
|
| 304 |
+
|
| 305 |
+
#endif /* !__COMMON_FUNCTIONS_H__ */
|
| 306 |
+
|
| 307 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__)
|
| 308 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 309 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__
|
| 310 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/cudacc_ext.h
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2021-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("crt/cudacc_ext.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "crt/cudacc_ext.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDACC_EXT_H__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDACC_EXT_H__)
|
| 62 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 63 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDACC_EXT_H__
|
| 64 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_double_functions.h
ADDED
|
@@ -0,0 +1,1192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("crt/device_double_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "crt/device_double_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#if !defined(__DEVICE_DOUBLE_FUNCTIONS_H__)
|
| 61 |
+
#define __DEVICE_DOUBLE_FUNCTIONS_H__
|
| 62 |
+
|
| 63 |
+
/*******************************************************************************
|
| 64 |
+
* *
|
| 65 |
+
* *
|
| 66 |
+
* *
|
| 67 |
+
*******************************************************************************/
|
| 68 |
+
|
| 69 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 70 |
+
|
| 71 |
+
/*******************************************************************************
|
| 72 |
+
* *
|
| 73 |
+
* *
|
| 74 |
+
* *
|
| 75 |
+
*******************************************************************************/
|
| 76 |
+
|
| 77 |
+
#if defined(__CUDACC_RTC__)
|
| 78 |
+
#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ __device__
|
| 79 |
+
#else
|
| 80 |
+
#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ static __inline__ __device__
|
| 81 |
+
#endif /* __CUDACC_RTC__ */
|
| 82 |
+
|
| 83 |
+
#include "builtin_types.h"
|
| 84 |
+
#include "device_types.h"
|
| 85 |
+
#include "host_defines.h"
|
| 86 |
+
|
| 87 |
+
//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
|
| 88 |
+
#define EXCLUDE_FROM_RTC
|
| 89 |
+
|
| 90 |
+
extern "C"
|
| 91 |
+
{
|
| 92 |
+
/**
|
| 93 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 94 |
+
* \brief Reinterpret bits in a double as a 64-bit signed integer.
|
| 95 |
+
*
|
| 96 |
+
* Reinterpret the bits in the double-precision floating-point value \p x
|
| 97 |
+
* as a signed 64-bit integer.
|
| 98 |
+
* \return Returns reinterpreted value.
|
| 99 |
+
*/
|
| 100 |
+
extern __device__ __device_builtin__ long long int __double_as_longlong(double x);
|
| 101 |
+
/**
|
| 102 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 103 |
+
* \brief Reinterpret bits in a 64-bit signed integer as a double.
|
| 104 |
+
*
|
| 105 |
+
* Reinterpret the bits in the 64-bit signed integer value \p x as
|
| 106 |
+
* a double-precision floating-point value.
|
| 107 |
+
* \return Returns reinterpreted value.
|
| 108 |
+
*/
|
| 109 |
+
extern __device__ __device_builtin__ double __longlong_as_double(long long int x);
|
| 110 |
+
/**
|
| 111 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 112 |
+
* \brief Compute
|
| 113 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 114 |
+
* \xmlonly
|
| 115 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 116 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 117 |
+
* <m:mi>x</m:mi>
|
| 118 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 119 |
+
* <m:mi>y</m:mi>
|
| 120 |
+
* <m:mo>+</m:mo>
|
| 121 |
+
* <m:mi>z</m:mi>
|
| 122 |
+
* </m:math>
|
| 123 |
+
* </d4p_MathML>
|
| 124 |
+
* \endxmlonly
|
| 125 |
+
* as a single operation in round-to-nearest-even mode.
|
| 126 |
+
*
|
| 127 |
+
* Computes the value of
|
| 128 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 129 |
+
* \xmlonly
|
| 130 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 131 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 132 |
+
* <m:mi>x</m:mi>
|
| 133 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 134 |
+
* <m:mi>y</m:mi>
|
| 135 |
+
* <m:mo>+</m:mo>
|
| 136 |
+
* <m:mi>z</m:mi>
|
| 137 |
+
* </m:math>
|
| 138 |
+
* </d4p_MathML>
|
| 139 |
+
* \endxmlonly
|
| 140 |
+
* as a single ternary operation, rounding the
|
| 141 |
+
* result once in round-to-nearest-even mode.
|
| 142 |
+
*
|
| 143 |
+
* \return Returns the rounded value of
|
| 144 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 145 |
+
* \xmlonly
|
| 146 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 147 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 148 |
+
* <m:mi>x</m:mi>
|
| 149 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 150 |
+
* <m:mi>y</m:mi>
|
| 151 |
+
* <m:mo>+</m:mo>
|
| 152 |
+
* <m:mi>z</m:mi>
|
| 153 |
+
* </m:math>
|
| 154 |
+
* </d4p_MathML>
|
| 155 |
+
* \endxmlonly
|
| 156 |
+
* as a single operation.
|
| 157 |
+
* - fmaf(
|
| 158 |
+
* \latexonly $\pm \infty$ \endlatexonly
|
| 159 |
+
* \xmlonly
|
| 160 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 161 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 162 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 163 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 164 |
+
* </m:math>
|
| 165 |
+
* </d4p_MathML>
|
| 166 |
+
* \endxmlonly
|
| 167 |
+
* ,
|
| 168 |
+
* \latexonly $\pm 0$ \endlatexonly
|
| 169 |
+
* \xmlonly
|
| 170 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 171 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 172 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 173 |
+
* <m:mn>0</m:mn>
|
| 174 |
+
* </m:math>
|
| 175 |
+
* </d4p_MathML>
|
| 176 |
+
* \endxmlonly
|
| 177 |
+
* , \p z) returns NaN.
|
| 178 |
+
* - fmaf(
|
| 179 |
+
* \latexonly $\pm 0$ \endlatexonly
|
| 180 |
+
* \xmlonly
|
| 181 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 182 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 183 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 184 |
+
* <m:mn>0</m:mn>
|
| 185 |
+
* </m:math>
|
| 186 |
+
* </d4p_MathML>
|
| 187 |
+
* \endxmlonly
|
| 188 |
+
* ,
|
| 189 |
+
* \latexonly $\pm \infty$ \endlatexonly
|
| 190 |
+
* \xmlonly
|
| 191 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 192 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 193 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 194 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 195 |
+
* </m:math>
|
| 196 |
+
* </d4p_MathML>
|
| 197 |
+
* \endxmlonly
|
| 198 |
+
* , \p z) returns NaN.
|
| 199 |
+
* - fmaf(\p x, \p y,
|
| 200 |
+
* \latexonly $-\infty$ \endlatexonly
|
| 201 |
+
* \xmlonly
|
| 202 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 203 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 204 |
+
* <m:mo>-</m:mo>
|
| 205 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 206 |
+
* </m:math>
|
| 207 |
+
* </d4p_MathML>
|
| 208 |
+
* \endxmlonly
|
| 209 |
+
* ) returns NaN if
|
| 210 |
+
* \latexonly $x \times y$ \endlatexonly
|
| 211 |
+
* \xmlonly
|
| 212 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 213 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 214 |
+
* <m:mi>x</m:mi>
|
| 215 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 216 |
+
* <m:mi>y</m:mi>
|
| 217 |
+
* </m:math>
|
| 218 |
+
* </d4p_MathML>
|
| 219 |
+
* \endxmlonly
|
| 220 |
+
* is an exact
|
| 221 |
+
* \latexonly $+\infty$ \endlatexonly
|
| 222 |
+
* \xmlonly
|
| 223 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 224 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 225 |
+
* <m:mo>+</m:mo>
|
| 226 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 227 |
+
* </m:math>
|
| 228 |
+
* </d4p_MathML>
|
| 229 |
+
* \endxmlonly
|
| 230 |
+
* .
|
| 231 |
+
* - fmaf(\p x, \p y,
|
| 232 |
+
* \latexonly $+\infty$ \endlatexonly
|
| 233 |
+
* \xmlonly
|
| 234 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 235 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 236 |
+
* <m:mo>+</m:mo>
|
| 237 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 238 |
+
* </m:math>
|
| 239 |
+
* </d4p_MathML>
|
| 240 |
+
* \endxmlonly
|
| 241 |
+
* ) returns NaN if
|
| 242 |
+
* \latexonly $x \times y$ \endlatexonly
|
| 243 |
+
* \xmlonly
|
| 244 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 245 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 246 |
+
* <m:mi>x</m:mi>
|
| 247 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 248 |
+
* <m:mi>y</m:mi>
|
| 249 |
+
* </m:math>
|
| 250 |
+
* </d4p_MathML>
|
| 251 |
+
* \endxmlonly
|
| 252 |
+
* is an exact
|
| 253 |
+
* \latexonly $-\infty$ \endlatexonly
|
| 254 |
+
* \xmlonly
|
| 255 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 256 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 257 |
+
* <m:mo>-</m:mo>
|
| 258 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 259 |
+
* </m:math>
|
| 260 |
+
* </d4p_MathML>
|
| 261 |
+
* \endxmlonly
|
| 262 |
+
* .
|
| 263 |
+
*
|
| 264 |
+
* \note_accuracy_double
|
| 265 |
+
*/
|
| 266 |
+
extern __device__ __device_builtin__ double __fma_rn(double x, double y, double z);
|
| 267 |
+
/**
|
| 268 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 269 |
+
* \brief Compute
|
| 270 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 271 |
+
* \xmlonly
|
| 272 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 273 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 274 |
+
* <m:mi>x</m:mi>
|
| 275 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 276 |
+
* <m:mi>y</m:mi>
|
| 277 |
+
* <m:mo>+</m:mo>
|
| 278 |
+
* <m:mi>z</m:mi>
|
| 279 |
+
* </m:math>
|
| 280 |
+
* </d4p_MathML>
|
| 281 |
+
* \endxmlonly
|
| 282 |
+
* as a single operation in round-towards-zero mode.
|
| 283 |
+
*
|
| 284 |
+
* Computes the value of
|
| 285 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 286 |
+
* \xmlonly
|
| 287 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 288 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 289 |
+
* <m:mi>x</m:mi>
|
| 290 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 291 |
+
* <m:mi>y</m:mi>
|
| 292 |
+
* <m:mo>+</m:mo>
|
| 293 |
+
* <m:mi>z</m:mi>
|
| 294 |
+
* </m:math>
|
| 295 |
+
* </d4p_MathML>
|
| 296 |
+
* \endxmlonly
|
| 297 |
+
* as a single ternary operation, rounding the
|
| 298 |
+
* result once in round-towards-zero mode.
|
| 299 |
+
*
|
| 300 |
+
* \return Returns the rounded value of
|
| 301 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 302 |
+
* \xmlonly
|
| 303 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 304 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 305 |
+
* <m:mi>x</m:mi>
|
| 306 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 307 |
+
* <m:mi>y</m:mi>
|
| 308 |
+
* <m:mo>+</m:mo>
|
| 309 |
+
* <m:mi>z</m:mi>
|
| 310 |
+
* </m:math>
|
| 311 |
+
* </d4p_MathML>
|
| 312 |
+
* \endxmlonly
|
| 313 |
+
* as a single operation.
|
| 314 |
+
* - fmaf(
|
| 315 |
+
* \latexonly $\pm \infty$ \endlatexonly
|
| 316 |
+
* \xmlonly
|
| 317 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 318 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 319 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 320 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 321 |
+
* </m:math>
|
| 322 |
+
* </d4p_MathML>
|
| 323 |
+
* \endxmlonly
|
| 324 |
+
* ,
|
| 325 |
+
* \latexonly $\pm 0$ \endlatexonly
|
| 326 |
+
* \xmlonly
|
| 327 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 328 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 329 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 330 |
+
* <m:mn>0</m:mn>
|
| 331 |
+
* </m:math>
|
| 332 |
+
* </d4p_MathML>
|
| 333 |
+
* \endxmlonly
|
| 334 |
+
* , \p z) returns NaN.
|
| 335 |
+
* - fmaf(
|
| 336 |
+
* \latexonly $\pm 0$ \endlatexonly
|
| 337 |
+
* \xmlonly
|
| 338 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 339 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 340 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 341 |
+
* <m:mn>0</m:mn>
|
| 342 |
+
* </m:math>
|
| 343 |
+
* </d4p_MathML>
|
| 344 |
+
* \endxmlonly
|
| 345 |
+
* ,
|
| 346 |
+
* \latexonly $\pm \infty$ \endlatexonly
|
| 347 |
+
* \xmlonly
|
| 348 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 349 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 350 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 351 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 352 |
+
* </m:math>
|
| 353 |
+
* </d4p_MathML>
|
| 354 |
+
* \endxmlonly
|
| 355 |
+
* , \p z) returns NaN.
|
| 356 |
+
* - fmaf(\p x, \p y,
|
| 357 |
+
* \latexonly $-\infty$ \endlatexonly
|
| 358 |
+
* \xmlonly
|
| 359 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 360 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 361 |
+
* <m:mo>-</m:mo>
|
| 362 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 363 |
+
* </m:math>
|
| 364 |
+
* </d4p_MathML>
|
| 365 |
+
* \endxmlonly
|
| 366 |
+
* ) returns NaN if
|
| 367 |
+
* \latexonly $x \times y$ \endlatexonly
|
| 368 |
+
* \xmlonly
|
| 369 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 370 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 371 |
+
* <m:mi>x</m:mi>
|
| 372 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 373 |
+
* <m:mi>y</m:mi>
|
| 374 |
+
* </m:math>
|
| 375 |
+
* </d4p_MathML>
|
| 376 |
+
* \endxmlonly
|
| 377 |
+
* is an exact
|
| 378 |
+
* \latexonly $+\infty$ \endlatexonly
|
| 379 |
+
* \xmlonly
|
| 380 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 381 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 382 |
+
* <m:mo>+</m:mo>
|
| 383 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 384 |
+
* </m:math>
|
| 385 |
+
* </d4p_MathML>
|
| 386 |
+
* \endxmlonly
|
| 387 |
+
* .
|
| 388 |
+
* - fmaf(\p x, \p y,
|
| 389 |
+
* \latexonly $+\infty$ \endlatexonly
|
| 390 |
+
* \xmlonly
|
| 391 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 392 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 393 |
+
* <m:mo>+</m:mo>
|
| 394 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 395 |
+
* </m:math>
|
| 396 |
+
* </d4p_MathML>
|
| 397 |
+
* \endxmlonly
|
| 398 |
+
* ) returns NaN if
|
| 399 |
+
* \latexonly $x \times y$ \endlatexonly
|
| 400 |
+
* \xmlonly
|
| 401 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 402 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 403 |
+
* <m:mi>x</m:mi>
|
| 404 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 405 |
+
* <m:mi>y</m:mi>
|
| 406 |
+
* </m:math>
|
| 407 |
+
* </d4p_MathML>
|
| 408 |
+
* \endxmlonly
|
| 409 |
+
* is an exact
|
| 410 |
+
* \latexonly $-\infty$ \endlatexonly
|
| 411 |
+
* \xmlonly
|
| 412 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 413 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 414 |
+
* <m:mo>-</m:mo>
|
| 415 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 416 |
+
* </m:math>
|
| 417 |
+
* </d4p_MathML>
|
| 418 |
+
* \endxmlonly
|
| 419 |
+
* .
|
| 420 |
+
*
|
| 421 |
+
* \note_accuracy_double
|
| 422 |
+
*/
|
| 423 |
+
extern __device__ __device_builtin__ double __fma_rz(double x, double y, double z);
|
| 424 |
+
/**
|
| 425 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 426 |
+
* \brief Compute
|
| 427 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 428 |
+
* \xmlonly
|
| 429 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 430 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 431 |
+
* <m:mi>x</m:mi>
|
| 432 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 433 |
+
* <m:mi>y</m:mi>
|
| 434 |
+
* <m:mo>+</m:mo>
|
| 435 |
+
* <m:mi>z</m:mi>
|
| 436 |
+
* </m:math>
|
| 437 |
+
* </d4p_MathML>
|
| 438 |
+
* \endxmlonly
|
| 439 |
+
* as a single operation in round-up mode.
|
| 440 |
+
*
|
| 441 |
+
* Computes the value of
|
| 442 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 443 |
+
* \xmlonly
|
| 444 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 445 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 446 |
+
* <m:mi>x</m:mi>
|
| 447 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 448 |
+
* <m:mi>y</m:mi>
|
| 449 |
+
* <m:mo>+</m:mo>
|
| 450 |
+
* <m:mi>z</m:mi>
|
| 451 |
+
* </m:math>
|
| 452 |
+
* </d4p_MathML>
|
| 453 |
+
* \endxmlonly
|
| 454 |
+
* as a single ternary operation, rounding the
|
| 455 |
+
* result once in round-up (to positive infinity) mode.
|
| 456 |
+
*
|
| 457 |
+
* \return Returns the rounded value of
|
| 458 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 459 |
+
* \xmlonly
|
| 460 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 461 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 462 |
+
* <m:mi>x</m:mi>
|
| 463 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 464 |
+
* <m:mi>y</m:mi>
|
| 465 |
+
* <m:mo>+</m:mo>
|
| 466 |
+
* <m:mi>z</m:mi>
|
| 467 |
+
* </m:math>
|
| 468 |
+
* </d4p_MathML>
|
| 469 |
+
* \endxmlonly
|
| 470 |
+
* as a single operation.
|
| 471 |
+
* - fmaf(
|
| 472 |
+
* \latexonly $\pm \infty$ \endlatexonly
|
| 473 |
+
* \xmlonly
|
| 474 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 475 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 476 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 477 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 478 |
+
* </m:math>
|
| 479 |
+
* </d4p_MathML>
|
| 480 |
+
* \endxmlonly
|
| 481 |
+
* ,
|
| 482 |
+
* \latexonly $\pm 0$ \endlatexonly
|
| 483 |
+
* \xmlonly
|
| 484 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 485 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 486 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 487 |
+
* <m:mn>0</m:mn>
|
| 488 |
+
* </m:math>
|
| 489 |
+
* </d4p_MathML>
|
| 490 |
+
* \endxmlonly
|
| 491 |
+
* , \p z) returns NaN.
|
| 492 |
+
* - fmaf(
|
| 493 |
+
* \latexonly $\pm 0$ \endlatexonly
|
| 494 |
+
* \xmlonly
|
| 495 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 496 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 497 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 498 |
+
* <m:mn>0</m:mn>
|
| 499 |
+
* </m:math>
|
| 500 |
+
* </d4p_MathML>
|
| 501 |
+
* \endxmlonly
|
| 502 |
+
* ,
|
| 503 |
+
* \latexonly $\pm \infty$ \endlatexonly
|
| 504 |
+
* \xmlonly
|
| 505 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 506 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 507 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 508 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 509 |
+
* </m:math>
|
| 510 |
+
* </d4p_MathML>
|
| 511 |
+
* \endxmlonly
|
| 512 |
+
* , \p z) returns NaN.
|
| 513 |
+
* - fmaf(\p x, \p y,
|
| 514 |
+
* \latexonly $-\infty$ \endlatexonly
|
| 515 |
+
* \xmlonly
|
| 516 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 517 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 518 |
+
* <m:mo>-</m:mo>
|
| 519 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 520 |
+
* </m:math>
|
| 521 |
+
* </d4p_MathML>
|
| 522 |
+
* \endxmlonly
|
| 523 |
+
* ) returns NaN if
|
| 524 |
+
* \latexonly $x \times y$ \endlatexonly
|
| 525 |
+
* \xmlonly
|
| 526 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 527 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 528 |
+
* <m:mi>x</m:mi>
|
| 529 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 530 |
+
* <m:mi>y</m:mi>
|
| 531 |
+
* </m:math>
|
| 532 |
+
* </d4p_MathML>
|
| 533 |
+
* \endxmlonly
|
| 534 |
+
* is an exact
|
| 535 |
+
* \latexonly $+\infty$ \endlatexonly
|
| 536 |
+
* \xmlonly
|
| 537 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 538 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 539 |
+
* <m:mo>+</m:mo>
|
| 540 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 541 |
+
* </m:math>
|
| 542 |
+
* </d4p_MathML>
|
| 543 |
+
* \endxmlonly
|
| 544 |
+
* .
|
| 545 |
+
* - fmaf(\p x, \p y,
|
| 546 |
+
* \latexonly $+\infty$ \endlatexonly
|
| 547 |
+
* \xmlonly
|
| 548 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 549 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 550 |
+
* <m:mo>+</m:mo>
|
| 551 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 552 |
+
* </m:math>
|
| 553 |
+
* </d4p_MathML>
|
| 554 |
+
* \endxmlonly
|
| 555 |
+
* ) returns NaN if
|
| 556 |
+
* \latexonly $x \times y$ \endlatexonly
|
| 557 |
+
* \xmlonly
|
| 558 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 559 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 560 |
+
* <m:mi>x</m:mi>
|
| 561 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 562 |
+
* <m:mi>y</m:mi>
|
| 563 |
+
* </m:math>
|
| 564 |
+
* </d4p_MathML>
|
| 565 |
+
* \endxmlonly
|
| 566 |
+
* is an exact
|
| 567 |
+
* \latexonly $-\infty$ \endlatexonly
|
| 568 |
+
* \xmlonly
|
| 569 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 570 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 571 |
+
* <m:mo>-</m:mo>
|
| 572 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 573 |
+
* </m:math>
|
| 574 |
+
* </d4p_MathML>
|
| 575 |
+
* \endxmlonly
|
| 576 |
+
* .
|
| 577 |
+
*
|
| 578 |
+
* \note_accuracy_double
|
| 579 |
+
*/
|
| 580 |
+
extern __device__ __device_builtin__ double __fma_ru(double x, double y, double z);
|
| 581 |
+
/**
|
| 582 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 583 |
+
* \brief Compute
|
| 584 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 585 |
+
* \xmlonly
|
| 586 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 587 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 588 |
+
* <m:mi>x</m:mi>
|
| 589 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 590 |
+
* <m:mi>y</m:mi>
|
| 591 |
+
* <m:mo>+</m:mo>
|
| 592 |
+
* <m:mi>z</m:mi>
|
| 593 |
+
* </m:math>
|
| 594 |
+
* </d4p_MathML>
|
| 595 |
+
* \endxmlonly
|
| 596 |
+
* as a single operation in round-down mode.
|
| 597 |
+
*
|
| 598 |
+
* Computes the value of
|
| 599 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 600 |
+
* \xmlonly
|
| 601 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 602 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 603 |
+
* <m:mi>x</m:mi>
|
| 604 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 605 |
+
* <m:mi>y</m:mi>
|
| 606 |
+
* <m:mo>+</m:mo>
|
| 607 |
+
* <m:mi>z</m:mi>
|
| 608 |
+
* </m:math>
|
| 609 |
+
* </d4p_MathML>
|
| 610 |
+
* \endxmlonly
|
| 611 |
+
* as a single ternary operation, rounding the
|
| 612 |
+
* result once in round-down (to negative infinity) mode.
|
| 613 |
+
*
|
| 614 |
+
* \return Returns the rounded value of
|
| 615 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 616 |
+
* \xmlonly
|
| 617 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 618 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 619 |
+
* <m:mi>x</m:mi>
|
| 620 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 621 |
+
* <m:mi>y</m:mi>
|
| 622 |
+
* <m:mo>+</m:mo>
|
| 623 |
+
* <m:mi>z</m:mi>
|
| 624 |
+
* </m:math>
|
| 625 |
+
* </d4p_MathML>
|
| 626 |
+
* \endxmlonly
|
| 627 |
+
* as a single operation.
|
| 628 |
+
* - fmaf(
|
| 629 |
+
* \latexonly $\pm \infty$ \endlatexonly
|
| 630 |
+
* \xmlonly
|
| 631 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 632 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 633 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 634 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 635 |
+
* </m:math>
|
| 636 |
+
* </d4p_MathML>
|
| 637 |
+
* \endxmlonly
|
| 638 |
+
* ,
|
| 639 |
+
* \latexonly $\pm 0$ \endlatexonly
|
| 640 |
+
* \xmlonly
|
| 641 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 642 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 643 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 644 |
+
* <m:mn>0</m:mn>
|
| 645 |
+
* </m:math>
|
| 646 |
+
* </d4p_MathML>
|
| 647 |
+
* \endxmlonly
|
| 648 |
+
* , \p z) returns NaN.
|
| 649 |
+
* - fmaf(
|
| 650 |
+
* \latexonly $\pm 0$ \endlatexonly
|
| 651 |
+
* \xmlonly
|
| 652 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 653 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 654 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 655 |
+
* <m:mn>0</m:mn>
|
| 656 |
+
* </m:math>
|
| 657 |
+
* </d4p_MathML>
|
| 658 |
+
* \endxmlonly
|
| 659 |
+
* ,
|
| 660 |
+
* \latexonly $\pm \infty$ \endlatexonly
|
| 661 |
+
* \xmlonly
|
| 662 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 663 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 664 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 665 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 666 |
+
* </m:math>
|
| 667 |
+
* </d4p_MathML>
|
| 668 |
+
* \endxmlonly
|
| 669 |
+
* , \p z) returns NaN.
|
| 670 |
+
* - fmaf(\p x, \p y,
|
| 671 |
+
* \latexonly $-\infty$ \endlatexonly
|
| 672 |
+
* \xmlonly
|
| 673 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 674 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 675 |
+
* <m:mo>-</m:mo>
|
| 676 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 677 |
+
* </m:math>
|
| 678 |
+
* </d4p_MathML>
|
| 679 |
+
* \endxmlonly
|
| 680 |
+
* ) returns NaN if
|
| 681 |
+
* \latexonly $x \times y$ \endlatexonly
|
| 682 |
+
* \xmlonly
|
| 683 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 684 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 685 |
+
* <m:mi>x</m:mi>
|
| 686 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 687 |
+
* <m:mi>y</m:mi>
|
| 688 |
+
* </m:math>
|
| 689 |
+
* </d4p_MathML>
|
| 690 |
+
* \endxmlonly
|
| 691 |
+
* is an exact
|
| 692 |
+
* \latexonly $+\infty$ \endlatexonly
|
| 693 |
+
* \xmlonly
|
| 694 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 695 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 696 |
+
* <m:mo>+</m:mo>
|
| 697 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 698 |
+
* </m:math>
|
| 699 |
+
* </d4p_MathML>
|
| 700 |
+
* \endxmlonly
|
| 701 |
+
* .
|
| 702 |
+
* - fmaf(\p x, \p y,
|
| 703 |
+
* \latexonly $+\infty$ \endlatexonly
|
| 704 |
+
* \xmlonly
|
| 705 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 706 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 707 |
+
* <m:mo>+</m:mo>
|
| 708 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 709 |
+
* </m:math>
|
| 710 |
+
* </d4p_MathML>
|
| 711 |
+
* \endxmlonly
|
| 712 |
+
* ) returns NaN if
|
| 713 |
+
* \latexonly $x \times y$ \endlatexonly
|
| 714 |
+
* \xmlonly
|
| 715 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 716 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 717 |
+
* <m:mi>x</m:mi>
|
| 718 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 719 |
+
* <m:mi>y</m:mi>
|
| 720 |
+
* </m:math>
|
| 721 |
+
* </d4p_MathML>
|
| 722 |
+
* \endxmlonly
|
| 723 |
+
* is an exact
|
| 724 |
+
* \latexonly $-\infty$ \endlatexonly
|
| 725 |
+
* \xmlonly
|
| 726 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 727 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 728 |
+
* <m:mo>-</m:mo>
|
| 729 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 730 |
+
* </m:math>
|
| 731 |
+
* </d4p_MathML>
|
| 732 |
+
* \endxmlonly
|
| 733 |
+
* .
|
| 734 |
+
*
|
| 735 |
+
* \note_accuracy_double
|
| 736 |
+
*/
|
| 737 |
+
extern __device__ __device_builtin__ double __fma_rd(double x, double y, double z);
|
| 738 |
+
/**
|
| 739 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 740 |
+
* \brief Add two floating-point values in round-to-nearest-even mode.
|
| 741 |
+
*
|
| 742 |
+
* Adds two floating-point values \p x and \p y in round-to-nearest-even mode.
|
| 743 |
+
*
|
| 744 |
+
* \return Returns \p x + \p y.
|
| 745 |
+
*
|
| 746 |
+
* \note_accuracy_double
|
| 747 |
+
* \note_nofma
|
| 748 |
+
*/
|
| 749 |
+
extern __device__ __device_builtin__ double __dadd_rn(double x, double y);
|
| 750 |
+
/**
|
| 751 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 752 |
+
* \brief Add two floating-point values in round-towards-zero mode.
|
| 753 |
+
*
|
| 754 |
+
* Adds two floating-point values \p x and \p y in round-towards-zero mode.
|
| 755 |
+
*
|
| 756 |
+
* \return Returns \p x + \p y.
|
| 757 |
+
*
|
| 758 |
+
* \note_accuracy_double
|
| 759 |
+
* \note_nofma
|
| 760 |
+
*/
|
| 761 |
+
extern __device__ __device_builtin__ double __dadd_rz(double x, double y);
|
| 762 |
+
/**
|
| 763 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 764 |
+
* \brief Add two floating-point values in round-up mode.
|
| 765 |
+
*
|
| 766 |
+
* Adds two floating-point values \p x and \p y in round-up (to positive infinity) mode.
|
| 767 |
+
*
|
| 768 |
+
* \return Returns \p x + \p y.
|
| 769 |
+
*
|
| 770 |
+
* \note_accuracy_double
|
| 771 |
+
* \note_nofma
|
| 772 |
+
*/
|
| 773 |
+
extern __device__ __device_builtin__ double __dadd_ru(double x, double y);
|
| 774 |
+
/**
|
| 775 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 776 |
+
* \brief Add two floating-point values in round-down mode.
|
| 777 |
+
*
|
| 778 |
+
* Adds two floating-point values \p x and \p y in round-down (to negative infinity) mode.
|
| 779 |
+
*
|
| 780 |
+
* \return Returns \p x + \p y.
|
| 781 |
+
*
|
| 782 |
+
* \note_accuracy_double
|
| 783 |
+
* \note_nofma
|
| 784 |
+
*/
|
| 785 |
+
extern __device__ __device_builtin__ double __dadd_rd(double x, double y);
|
| 786 |
+
/**
|
| 787 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 788 |
+
* \brief Subtract two floating-point values in round-to-nearest-even mode.
|
| 789 |
+
*
|
| 790 |
+
* Subtracts two floating-point values \p x and \p y in round-to-nearest-even mode.
|
| 791 |
+
*
|
| 792 |
+
* \return Returns \p x - \p y.
|
| 793 |
+
*
|
| 794 |
+
* \note_accuracy_double
|
| 795 |
+
* \note_nofma
|
| 796 |
+
*/
|
| 797 |
+
extern __device__ __device_builtin__ double __dsub_rn(double x, double y);
|
| 798 |
+
/**
|
| 799 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 800 |
+
* \brief Subtract two floating-point values in round-towards-zero mode.
|
| 801 |
+
*
|
| 802 |
+
* Subtracts two floating-point values \p x and \p y in round-towards-zero mode.
|
| 803 |
+
*
|
| 804 |
+
* \return Returns \p x - \p y.
|
| 805 |
+
*
|
| 806 |
+
* \note_accuracy_double
|
| 807 |
+
* \note_nofma
|
| 808 |
+
*/
|
| 809 |
+
extern __device__ __device_builtin__ double __dsub_rz(double x, double y);
|
| 810 |
+
/**
|
| 811 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 812 |
+
* \brief Subtract two floating-point values in round-up mode.
|
| 813 |
+
*
|
| 814 |
+
* Subtracts two floating-point values \p x and \p y in round-up (to positive infinity) mode.
|
| 815 |
+
*
|
| 816 |
+
* \return Returns \p x - \p y.
|
| 817 |
+
*
|
| 818 |
+
* \note_accuracy_double
|
| 819 |
+
* \note_nofma
|
| 820 |
+
*/
|
| 821 |
+
extern __device__ __device_builtin__ double __dsub_ru(double x, double y);
|
| 822 |
+
/**
|
| 823 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 824 |
+
* \brief Subtract two floating-point values in round-down mode.
|
| 825 |
+
*
|
| 826 |
+
* Subtracts two floating-point values \p x and \p y in round-down (to negative infinity) mode.
|
| 827 |
+
*
|
| 828 |
+
* \return Returns \p x - \p y.
|
| 829 |
+
*
|
| 830 |
+
* \note_accuracy_double
|
| 831 |
+
* \note_nofma
|
| 832 |
+
*/
|
| 833 |
+
extern __device__ __device_builtin__ double __dsub_rd(double x, double y);
|
| 834 |
+
/**
|
| 835 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 836 |
+
* \brief Multiply two floating-point values in round-to-nearest-even mode.
|
| 837 |
+
*
|
| 838 |
+
* Multiplies two floating-point values \p x and \p y in round-to-nearest-even mode.
|
| 839 |
+
*
|
| 840 |
+
* \return Returns \p x * \p y.
|
| 841 |
+
*
|
| 842 |
+
* \note_accuracy_double
|
| 843 |
+
* \note_nofma
|
| 844 |
+
*/
|
| 845 |
+
extern __device__ __device_builtin__ double __dmul_rn(double x, double y);
|
| 846 |
+
/**
|
| 847 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 848 |
+
* \brief Multiply two floating-point values in round-towards-zero mode.
|
| 849 |
+
*
|
| 850 |
+
* Multiplies two floating-point values \p x and \p y in round-towards-zero mode.
|
| 851 |
+
*
|
| 852 |
+
* \return Returns \p x * \p y.
|
| 853 |
+
*
|
| 854 |
+
* \note_accuracy_double
|
| 855 |
+
* \note_nofma
|
| 856 |
+
*/
|
| 857 |
+
extern __device__ __device_builtin__ double __dmul_rz(double x, double y);
|
| 858 |
+
/**
|
| 859 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 860 |
+
* \brief Multiply two floating-point values in round-up mode.
|
| 861 |
+
*
|
| 862 |
+
* Multiplies two floating-point values \p x and \p y in round-up (to positive infinity) mode.
|
| 863 |
+
*
|
| 864 |
+
* \return Returns \p x * \p y.
|
| 865 |
+
*
|
| 866 |
+
* \note_accuracy_double
|
| 867 |
+
* \note_nofma
|
| 868 |
+
*/
|
| 869 |
+
extern __device__ __device_builtin__ double __dmul_ru(double x, double y);
|
| 870 |
+
/**
|
| 871 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 872 |
+
* \brief Multiply two floating-point values in round-down mode.
|
| 873 |
+
*
|
| 874 |
+
* Multiplies two floating-point values \p x and \p y in round-down (to negative infinity) mode.
|
| 875 |
+
*
|
| 876 |
+
* \return Returns \p x * \p y.
|
| 877 |
+
*
|
| 878 |
+
* \note_accuracy_double
|
| 879 |
+
* \note_nofma
|
| 880 |
+
*/
|
| 881 |
+
extern __device__ __device_builtin__ double __dmul_rd(double x, double y);
|
| 882 |
+
/**
|
| 883 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 884 |
+
* \brief Convert a double to a float in round-to-nearest-even mode.
|
| 885 |
+
*
|
| 886 |
+
* Convert the double-precision floating-point value \p x to a single-precision
|
| 887 |
+
* floating-point value in round-to-nearest-even mode.
|
| 888 |
+
* \return Returns converted value.
|
| 889 |
+
*/
|
| 890 |
+
extern __device__ __device_builtin__ float __double2float_rn(double x);
|
| 891 |
+
/**
|
| 892 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 893 |
+
* \brief Convert a double to a float in round-towards-zero mode.
|
| 894 |
+
*
|
| 895 |
+
* Convert the double-precision floating-point value \p x to a single-precision
|
| 896 |
+
* floating-point value in round-towards-zero mode.
|
| 897 |
+
* \return Returns converted value.
|
| 898 |
+
*/
|
| 899 |
+
extern __device__ __device_builtin__ float __double2float_rz(double x);
|
| 900 |
+
/**
|
| 901 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 902 |
+
* \brief Convert a double to a float in round-up mode.
|
| 903 |
+
*
|
| 904 |
+
* Convert the double-precision floating-point value \p x to a single-precision
|
| 905 |
+
* floating-point value in round-up (to positive infinity) mode.
|
| 906 |
+
* \return Returns converted value.
|
| 907 |
+
*/
|
| 908 |
+
extern __device__ __device_builtin__ float __double2float_ru(double x);
|
| 909 |
+
/**
|
| 910 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 911 |
+
* \brief Convert a double to a float in round-down mode.
|
| 912 |
+
*
|
| 913 |
+
* Convert the double-precision floating-point value \p x to a single-precision
|
| 914 |
+
* floating-point value in round-down (to negative infinity) mode.
|
| 915 |
+
* \return Returns converted value.
|
| 916 |
+
*/
|
| 917 |
+
extern __device__ __device_builtin__ float __double2float_rd(double x);
|
| 918 |
+
/**
|
| 919 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 920 |
+
* \brief Convert a double to a signed int in round-to-nearest-even mode.
|
| 921 |
+
*
|
| 922 |
+
* Convert the double-precision floating-point value \p x to a
|
| 923 |
+
* signed integer value in round-to-nearest-even mode.
|
| 924 |
+
* \return Returns converted value.
|
| 925 |
+
*/
|
| 926 |
+
extern __device__ __device_builtin__ int __double2int_rn(double x);
|
| 927 |
+
/**
|
| 928 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 929 |
+
* \brief Convert a double to a signed int in round-up mode.
|
| 930 |
+
*
|
| 931 |
+
* Convert the double-precision floating-point value \p x to a
|
| 932 |
+
* signed integer value in round-up (to positive infinity) mode.
|
| 933 |
+
* \return Returns converted value.
|
| 934 |
+
*/
|
| 935 |
+
extern __device__ __device_builtin__ int __double2int_ru(double x);
|
| 936 |
+
/**
|
| 937 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 938 |
+
* \brief Convert a double to a signed int in round-down mode.
|
| 939 |
+
*
|
| 940 |
+
* Convert the double-precision floating-point value \p x to a
|
| 941 |
+
* signed integer value in round-down (to negative infinity) mode.
|
| 942 |
+
* \return Returns converted value.
|
| 943 |
+
*/
|
| 944 |
+
extern __device__ __device_builtin__ int __double2int_rd(double x);
|
| 945 |
+
/**
|
| 946 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 947 |
+
* \brief Convert a double to an unsigned int in round-to-nearest-even mode.
|
| 948 |
+
*
|
| 949 |
+
* Convert the double-precision floating-point value \p x to an
|
| 950 |
+
* unsigned integer value in round-to-nearest-even mode.
|
| 951 |
+
* \return Returns converted value.
|
| 952 |
+
*/
|
| 953 |
+
extern __device__ __device_builtin__ unsigned int __double2uint_rn(double x);
|
| 954 |
+
/**
|
| 955 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 956 |
+
* \brief Convert a double to an unsigned int in round-up mode.
|
| 957 |
+
*
|
| 958 |
+
* Convert the double-precision floating-point value \p x to an
|
| 959 |
+
* unsigned integer value in round-up (to positive infinity) mode.
|
| 960 |
+
* \return Returns converted value.
|
| 961 |
+
*/
|
| 962 |
+
extern __device__ __device_builtin__ unsigned int __double2uint_ru(double x);
|
| 963 |
+
/**
|
| 964 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 965 |
+
* \brief Convert a double to an unsigned int in round-down mode.
|
| 966 |
+
*
|
| 967 |
+
* Convert the double-precision floating-point value \p x to an
|
| 968 |
+
* unsigned integer value in round-down (to negative infinity) mode.
|
| 969 |
+
* \return Returns converted value.
|
| 970 |
+
*/
|
| 971 |
+
extern __device__ __device_builtin__ unsigned int __double2uint_rd(double x);
|
| 972 |
+
/**
|
| 973 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 974 |
+
* \brief Convert a double to a signed 64-bit int in round-to-nearest-even mode.
|
| 975 |
+
*
|
| 976 |
+
* Convert the double-precision floating-point value \p x to a
|
| 977 |
+
* signed 64-bit integer value in round-to-nearest-even mode.
|
| 978 |
+
* \return Returns converted value.
|
| 979 |
+
*/
|
| 980 |
+
extern __device__ __device_builtin__ long long int __double2ll_rn(double x);
|
| 981 |
+
/**
|
| 982 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 983 |
+
* \brief Convert a double to a signed 64-bit int in round-up mode.
|
| 984 |
+
*
|
| 985 |
+
* Convert the double-precision floating-point value \p x to a
|
| 986 |
+
* signed 64-bit integer value in round-up (to positive infinity) mode.
|
| 987 |
+
* \return Returns converted value.
|
| 988 |
+
*/
|
| 989 |
+
extern __device__ __device_builtin__ long long int __double2ll_ru(double x);
|
| 990 |
+
/**
|
| 991 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 992 |
+
* \brief Convert a double to a signed 64-bit int in round-down mode.
|
| 993 |
+
*
|
| 994 |
+
* Convert the double-precision floating-point value \p x to a
|
| 995 |
+
* signed 64-bit integer value in round-down (to negative infinity) mode.
|
| 996 |
+
* \return Returns converted value.
|
| 997 |
+
*/
|
| 998 |
+
extern __device__ __device_builtin__ long long int __double2ll_rd(double x);
|
| 999 |
+
/**
|
| 1000 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1001 |
+
* \brief Convert a double to an unsigned 64-bit int in round-to-nearest-even mode.
|
| 1002 |
+
*
|
| 1003 |
+
* Convert the double-precision floating-point value \p x to an
|
| 1004 |
+
* unsigned 64-bit integer value in round-to-nearest-even mode.
|
| 1005 |
+
* \return Returns converted value.
|
| 1006 |
+
*/
|
| 1007 |
+
extern __device__ __device_builtin__ unsigned long long int __double2ull_rn(double x);
|
| 1008 |
+
/**
|
| 1009 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1010 |
+
* \brief Convert a double to an unsigned 64-bit int in round-up mode.
|
| 1011 |
+
*
|
| 1012 |
+
* Convert the double-precision floating-point value \p x to an
|
| 1013 |
+
* unsigned 64-bit integer value in round-up (to positive infinity) mode.
|
| 1014 |
+
* \return Returns converted value.
|
| 1015 |
+
*/
|
| 1016 |
+
extern __device__ __device_builtin__ unsigned long long int __double2ull_ru(double x);
|
| 1017 |
+
/**
|
| 1018 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1019 |
+
* \brief Convert a double to an unsigned 64-bit int in round-down mode.
|
| 1020 |
+
*
|
| 1021 |
+
* Convert the double-precision floating-point value \p x to an
|
| 1022 |
+
* unsigned 64-bit integer value in round-down (to negative infinity) mode.
|
| 1023 |
+
* \return Returns converted value.
|
| 1024 |
+
*/
|
| 1025 |
+
extern __device__ __device_builtin__ unsigned long long int __double2ull_rd(double x);
|
| 1026 |
+
/**
|
| 1027 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1028 |
+
* \brief Convert a signed int to a double.
|
| 1029 |
+
*
|
| 1030 |
+
* Convert the signed integer value \p x to a double-precision floating-point value.
|
| 1031 |
+
* \return Returns converted value.
|
| 1032 |
+
*/
|
| 1033 |
+
extern __device__ __device_builtin__ double __int2double_rn(int x);
|
| 1034 |
+
/**
|
| 1035 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1036 |
+
* \brief Convert an unsigned int to a double.
|
| 1037 |
+
*
|
| 1038 |
+
* Convert the unsigned integer value \p x to a double-precision floating-point value.
|
| 1039 |
+
* \return Returns converted value.
|
| 1040 |
+
*/
|
| 1041 |
+
extern __device__ __device_builtin__ double __uint2double_rn(unsigned int x);
|
| 1042 |
+
/**
|
| 1043 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1044 |
+
* \brief Convert a signed 64-bit int to a double in round-to-nearest-even mode.
|
| 1045 |
+
*
|
| 1046 |
+
* Convert the signed 64-bit integer value \p x to a double-precision floating-point
|
| 1047 |
+
* value in round-to-nearest-even mode.
|
| 1048 |
+
* \return Returns converted value.
|
| 1049 |
+
*/
|
| 1050 |
+
extern __device__ __device_builtin__ double __ll2double_rn(long long int x);
|
| 1051 |
+
/**
|
| 1052 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1053 |
+
* \brief Convert a signed 64-bit int to a double in round-towards-zero mode.
|
| 1054 |
+
*
|
| 1055 |
+
* Convert the signed 64-bit integer value \p x to a double-precision floating-point
|
| 1056 |
+
* value in round-towards-zero mode.
|
| 1057 |
+
* \return Returns converted value.
|
| 1058 |
+
*/
|
| 1059 |
+
extern __device__ __device_builtin__ double __ll2double_rz(long long int x);
|
| 1060 |
+
/**
|
| 1061 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1062 |
+
* \brief Convert a signed 64-bit int to a double in round-up mode.
|
| 1063 |
+
*
|
| 1064 |
+
* Convert the signed 64-bit integer value \p x to a double-precision floating-point
|
| 1065 |
+
* value in round-up (to positive infinity) mode.
|
| 1066 |
+
* \return Returns converted value.
|
| 1067 |
+
*/
|
| 1068 |
+
extern __device__ __device_builtin__ double __ll2double_ru(long long int x);
|
| 1069 |
+
/**
|
| 1070 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1071 |
+
* \brief Convert a signed 64-bit int to a double in round-down mode.
|
| 1072 |
+
*
|
| 1073 |
+
* Convert the signed 64-bit integer value \p x to a double-precision floating-point
|
| 1074 |
+
* value in round-down (to negative infinity) mode.
|
| 1075 |
+
* \return Returns converted value.
|
| 1076 |
+
*/
|
| 1077 |
+
extern __device__ __device_builtin__ double __ll2double_rd(long long int x);
|
| 1078 |
+
/**
|
| 1079 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1080 |
+
* \brief Convert an unsigned 64-bit int to a double in round-to-nearest-even mode.
|
| 1081 |
+
*
|
| 1082 |
+
* Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
|
| 1083 |
+
* value in round-to-nearest-even mode.
|
| 1084 |
+
* \return Returns converted value.
|
| 1085 |
+
*/
|
| 1086 |
+
extern __device__ __device_builtin__ double __ull2double_rn(unsigned long long int x);
|
| 1087 |
+
/**
|
| 1088 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1089 |
+
* \brief Convert an unsigned 64-bit int to a double in round-towards-zero mode.
|
| 1090 |
+
*
|
| 1091 |
+
* Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
|
| 1092 |
+
* value in round-towards-zero mode.
|
| 1093 |
+
* \return Returns converted value.
|
| 1094 |
+
*/
|
| 1095 |
+
extern __device__ __device_builtin__ double __ull2double_rz(unsigned long long int x);
|
| 1096 |
+
/**
|
| 1097 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1098 |
+
* \brief Convert an unsigned 64-bit int to a double in round-up mode.
|
| 1099 |
+
*
|
| 1100 |
+
* Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
|
| 1101 |
+
* value in round-up (to positive infinity) mode.
|
| 1102 |
+
* \return Returns converted value.
|
| 1103 |
+
*/
|
| 1104 |
+
extern __device__ __device_builtin__ double __ull2double_ru(unsigned long long int x);
|
| 1105 |
+
/**
|
| 1106 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1107 |
+
* \brief Convert an unsigned 64-bit int to a double in round-down mode.
|
| 1108 |
+
*
|
| 1109 |
+
* Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
|
| 1110 |
+
* value in round-down (to negative infinity) mode.
|
| 1111 |
+
* \return Returns converted value.
|
| 1112 |
+
*/
|
| 1113 |
+
extern __device__ __device_builtin__ double __ull2double_rd(unsigned long long int x);
|
| 1114 |
+
/**
|
| 1115 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1116 |
+
* \brief Reinterpret high 32 bits in a double as a signed integer.
|
| 1117 |
+
*
|
| 1118 |
+
* Reinterpret the high 32 bits in the double-precision floating-point value \p x
|
| 1119 |
+
* as a signed integer.
|
| 1120 |
+
* \return Returns reinterpreted value.
|
| 1121 |
+
*/
|
| 1122 |
+
extern __device__ __device_builtin__ int __double2hiint(double x);
|
| 1123 |
+
/**
|
| 1124 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1125 |
+
* \brief Reinterpret low 32 bits in a double as a signed integer.
|
| 1126 |
+
*
|
| 1127 |
+
* Reinterpret the low 32 bits in the double-precision floating-point value \p x
|
| 1128 |
+
* as a signed integer.
|
| 1129 |
+
* \return Returns reinterpreted value.
|
| 1130 |
+
*/
|
| 1131 |
+
extern __device__ __device_builtin__ int __double2loint(double x);
|
| 1132 |
+
/**
|
| 1133 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1134 |
+
* \brief Reinterpret high and low 32-bit integer values as a double.
|
| 1135 |
+
*
|
| 1136 |
+
* Reinterpret the integer value of \p hi as the high 32 bits of a
|
| 1137 |
+
* double-precision floating-point value and the integer value of \p lo
|
| 1138 |
+
* as the low 32 bits of the same double-precision floating-point value.
|
| 1139 |
+
* \return Returns reinterpreted value.
|
| 1140 |
+
*/
|
| 1141 |
+
extern __device__ __device_builtin__ double __hiloint2double(int hi, int lo);
|
| 1142 |
+
}
|
| 1143 |
+
|
| 1144 |
+
/*******************************************************************************
|
| 1145 |
+
* *
|
| 1146 |
+
* *
|
| 1147 |
+
* *
|
| 1148 |
+
*******************************************************************************/
|
| 1149 |
+
|
| 1150 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double fma(double a, double b, double c, enum cudaRoundMode mode);
|
| 1151 |
+
|
| 1152 |
+
#undef EXCLUDE_FROM_RTC
|
| 1153 |
+
|
| 1154 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dmul(double a, double b, enum cudaRoundMode mode = cudaRoundNearest);
|
| 1155 |
+
|
| 1156 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dadd(double a, double b, enum cudaRoundMode mode = cudaRoundNearest);
|
| 1157 |
+
|
| 1158 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dsub(double a, double b, enum cudaRoundMode mode = cudaRoundNearest);
|
| 1159 |
+
|
| 1160 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ int double2int(double a, enum cudaRoundMode mode = cudaRoundZero);
|
| 1161 |
+
|
| 1162 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned int double2uint(double a, enum cudaRoundMode mode = cudaRoundZero);
|
| 1163 |
+
|
| 1164 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ long long int double2ll(double a, enum cudaRoundMode mode = cudaRoundZero);
|
| 1165 |
+
|
| 1166 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned long long int double2ull(double a, enum cudaRoundMode mode = cudaRoundZero);
|
| 1167 |
+
|
| 1168 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ll2double(long long int a, enum cudaRoundMode mode = cudaRoundNearest);
|
| 1169 |
+
|
| 1170 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ull2double(unsigned long long int a, enum cudaRoundMode mode = cudaRoundNearest);
|
| 1171 |
+
|
| 1172 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double int2double(int a, enum cudaRoundMode mode = cudaRoundNearest);
|
| 1173 |
+
|
| 1174 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double uint2double(unsigned int a, enum cudaRoundMode mode = cudaRoundNearest);
|
| 1175 |
+
|
| 1176 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double float2double(float a, enum cudaRoundMode mode = cudaRoundNearest);
|
| 1177 |
+
|
| 1178 |
+
#undef __DEVICE_DOUBLE_FUNCTIONS_DECL__
|
| 1179 |
+
|
| 1180 |
+
|
| 1181 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 1182 |
+
|
| 1183 |
+
#if !defined(__CUDACC_RTC__)
|
| 1184 |
+
#include "device_double_functions.hpp"
|
| 1185 |
+
#endif /* !__CUDACC_RTC__ */
|
| 1186 |
+
|
| 1187 |
+
#endif /* !__DEVICE_DOUBLE_FUNCTIONS_H__ */
|
| 1188 |
+
|
| 1189 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H__)
|
| 1190 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 1191 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H__
|
| 1192 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_double_functions.hpp
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("crt/device_double_functions.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "crt/device_double_functions.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#if !defined(__DEVICE_DOUBLE_FUNCTIONS_HPP__)
|
| 61 |
+
#define __DEVICE_DOUBLE_FUNCTIONS_HPP__
|
| 62 |
+
|
| 63 |
+
/*******************************************************************************
|
| 64 |
+
* *
|
| 65 |
+
* *
|
| 66 |
+
* *
|
| 67 |
+
*******************************************************************************/
|
| 68 |
+
|
| 69 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 70 |
+
|
| 71 |
+
/*******************************************************************************
|
| 72 |
+
* *
|
| 73 |
+
* *
|
| 74 |
+
* *
|
| 75 |
+
*******************************************************************************/
|
| 76 |
+
|
| 77 |
+
#if defined(__CUDACC_RTC__)
|
| 78 |
+
#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ __device__
|
| 79 |
+
#else
|
| 80 |
+
#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ static __inline__ __device__
|
| 81 |
+
#endif /* __CUDACC_RTC__ */
|
| 82 |
+
|
| 83 |
+
#include "builtin_types.h"
|
| 84 |
+
#include "device_types.h"
|
| 85 |
+
#include "host_defines.h"
|
| 86 |
+
|
| 87 |
+
/*******************************************************************************
|
| 88 |
+
* *
|
| 89 |
+
* *
|
| 90 |
+
* *
|
| 91 |
+
*******************************************************************************/
|
| 92 |
+
|
| 93 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double fma(double a, double b, double c, enum cudaRoundMode mode)
|
| 94 |
+
{
|
| 95 |
+
return mode == cudaRoundZero ? __fma_rz(a, b, c) :
|
| 96 |
+
mode == cudaRoundPosInf ? __fma_ru(a, b, c) :
|
| 97 |
+
mode == cudaRoundMinInf ? __fma_rd(a, b, c) :
|
| 98 |
+
__fma_rn(a, b, c);
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dmul(double a, double b, enum cudaRoundMode mode)
|
| 102 |
+
{
|
| 103 |
+
return mode == cudaRoundZero ? __dmul_rz(a, b) :
|
| 104 |
+
mode == cudaRoundPosInf ? __dmul_ru(a, b) :
|
| 105 |
+
mode == cudaRoundMinInf ? __dmul_rd(a, b) :
|
| 106 |
+
__dmul_rn(a, b);
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dadd(double a, double b, enum cudaRoundMode mode)
|
| 110 |
+
{
|
| 111 |
+
return mode == cudaRoundZero ? __dadd_rz(a, b) :
|
| 112 |
+
mode == cudaRoundPosInf ? __dadd_ru(a, b) :
|
| 113 |
+
mode == cudaRoundMinInf ? __dadd_rd(a, b) :
|
| 114 |
+
__dadd_rn(a, b);
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dsub(double a, double b, enum cudaRoundMode mode)
|
| 118 |
+
{
|
| 119 |
+
return mode == cudaRoundZero ? __dsub_rz(a, b) :
|
| 120 |
+
mode == cudaRoundPosInf ? __dsub_ru(a, b) :
|
| 121 |
+
mode == cudaRoundMinInf ? __dsub_rd(a, b) :
|
| 122 |
+
__dsub_rn(a, b);
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ int double2int(double a, enum cudaRoundMode mode)
|
| 126 |
+
{
|
| 127 |
+
return mode == cudaRoundNearest ? __double2int_rn(a) :
|
| 128 |
+
mode == cudaRoundPosInf ? __double2int_ru(a) :
|
| 129 |
+
mode == cudaRoundMinInf ? __double2int_rd(a) :
|
| 130 |
+
__double2int_rz(a);
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned int double2uint(double a, enum cudaRoundMode mode)
|
| 134 |
+
{
|
| 135 |
+
return mode == cudaRoundNearest ? __double2uint_rn(a) :
|
| 136 |
+
mode == cudaRoundPosInf ? __double2uint_ru(a) :
|
| 137 |
+
mode == cudaRoundMinInf ? __double2uint_rd(a) :
|
| 138 |
+
__double2uint_rz(a);
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ long long int double2ll(double a, enum cudaRoundMode mode)
|
| 142 |
+
{
|
| 143 |
+
return mode == cudaRoundNearest ? __double2ll_rn(a) :
|
| 144 |
+
mode == cudaRoundPosInf ? __double2ll_ru(a) :
|
| 145 |
+
mode == cudaRoundMinInf ? __double2ll_rd(a) :
|
| 146 |
+
__double2ll_rz(a);
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned long long int double2ull(double a, enum cudaRoundMode mode)
|
| 150 |
+
{
|
| 151 |
+
return mode == cudaRoundNearest ? __double2ull_rn(a) :
|
| 152 |
+
mode == cudaRoundPosInf ? __double2ull_ru(a) :
|
| 153 |
+
mode == cudaRoundMinInf ? __double2ull_rd(a) :
|
| 154 |
+
__double2ull_rz(a);
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ll2double(long long int a, enum cudaRoundMode mode)
|
| 158 |
+
{
|
| 159 |
+
return mode == cudaRoundZero ? __ll2double_rz(a) :
|
| 160 |
+
mode == cudaRoundPosInf ? __ll2double_ru(a) :
|
| 161 |
+
mode == cudaRoundMinInf ? __ll2double_rd(a) :
|
| 162 |
+
__ll2double_rn(a);
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ull2double(unsigned long long int a, enum cudaRoundMode mode)
|
| 166 |
+
{
|
| 167 |
+
return mode == cudaRoundZero ? __ull2double_rz(a) :
|
| 168 |
+
mode == cudaRoundPosInf ? __ull2double_ru(a) :
|
| 169 |
+
mode == cudaRoundMinInf ? __ull2double_rd(a) :
|
| 170 |
+
__ull2double_rn(a);
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double int2double(int a, enum cudaRoundMode mode)
|
| 174 |
+
{
|
| 175 |
+
return (double)a;
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double uint2double(unsigned int a, enum cudaRoundMode mode)
|
| 179 |
+
{
|
| 180 |
+
return (double)a;
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
__DEVICE_DOUBLE_FUNCTIONS_DECL__ double float2double(float a, enum cudaRoundMode mode)
|
| 184 |
+
{
|
| 185 |
+
return (double)a;
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
#undef __DEVICE_DOUBLE_FUNCTIONS_DECL__
|
| 189 |
+
|
| 190 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 191 |
+
|
| 192 |
+
#endif /* !__DEVICE_DOUBLE_FUNCTIONS_HPP__ */
|
| 193 |
+
|
| 194 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__)
|
| 195 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 196 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__
|
| 197 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_functions.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_functions.hpp
ADDED
|
@@ -0,0 +1,1197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("crt/device_functions.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "crt/device_functions.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#if !defined(__DEVICE_FUNCTIONS_HPP__)
|
| 61 |
+
#define __DEVICE_FUNCTIONS_HPP__
|
| 62 |
+
|
| 63 |
+
/*******************************************************************************
|
| 64 |
+
* *
|
| 65 |
+
* *
|
| 66 |
+
* *
|
| 67 |
+
*******************************************************************************/
|
| 68 |
+
|
| 69 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 70 |
+
|
| 71 |
+
#if defined(__CUDACC_RTC__)
|
| 72 |
+
#define __DEVICE_FUNCTIONS_DECL__ __device__
|
| 73 |
+
#define __DEVICE_FUNCTIONS_STATIC_DECL__ __device__
|
| 74 |
+
#define __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ __device__ __host__ __cudart_builtin__
|
| 75 |
+
#else
|
| 76 |
+
#define __DEVICE_FUNCTIONS_DECL__ __device__
|
| 77 |
+
#define __DEVICE_FUNCTIONS_STATIC_DECL__ static __inline__ __device__
|
| 78 |
+
#define __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ static __inline__ __device__ __host__ __cudart_builtin__
|
| 79 |
+
#endif /* __CUDACC_RTC__ */
|
| 80 |
+
|
| 81 |
+
#include "builtin_types.h"
|
| 82 |
+
#include "device_types.h"
|
| 83 |
+
#include "host_defines.h"
|
| 84 |
+
|
| 85 |
+
#undef __DEVICE_FUNCTIONS_DECL__
|
| 86 |
+
#undef __DEVICE_FUNCTIONS_STATIC_DECL__
|
| 87 |
+
|
| 88 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 89 |
+
|
| 90 |
+
/*******************************************************************************
|
| 91 |
+
* *
|
| 92 |
+
* *
|
| 93 |
+
* *
|
| 94 |
+
*******************************************************************************/
|
| 95 |
+
|
| 96 |
+
#ifdef __CUDACC__
|
| 97 |
+
# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
|
| 98 |
+
#define __CUDA_AND_AT_LEAST_SM_90__
|
| 99 |
+
#endif /* defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) */
|
| 100 |
+
# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
|
| 101 |
+
#define __CUDA_AND_AT_LEAST_SM_70__
|
| 102 |
+
#endif /* defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) */
|
| 103 |
+
# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
|
| 104 |
+
#define __CUDA_AND_AT_LEAST_SM_75__
|
| 105 |
+
#endif /* defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) */
|
| 106 |
+
#endif /* __CUDACC__ */
|
| 107 |
+
|
| 108 |
+
/*
 * Signed 32-bit max(a, b) clamped below at zero ("max + relu").
 * On SM90+ devices this maps to the fused max.s32.relu instruction;
 * on the host and older devices it is emulated with max and a clamp.
 */
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax_s32_relu(const int a, const int b){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm("{max.s32.relu %0, %1, %2;}" : "=r"(res) : "r"(a), "r"(b));
    return res;
#else
    // Fallback: take the larger input, then clamp negatives to zero.
    const int larger = max(a, b);
    return (larger > 0) ? larger : 0;
#endif
}
|
| 120 |
+
|
| 121 |
+
/*
 * Per-lane signed max of two packed s16x2 words, then relu per lane.
 * SM90+ uses the fused max.s16x2.relu instruction; other device
 * architectures emulate it with the SIMD video intrinsics; the host
 * path unpacks the lanes, computes, and repacks them.
 */
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax_s16x2_relu(const unsigned int a, const unsigned int b){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm("{max.s16x2.relu %0, %1, %2;}" : "=r"(res) : "r"(a), "r"(b));
#elif defined(__CUDA_ARCH__)
    // SIMD lane-wise max, then lane-wise max against 0 == relu.
    res = __vmaxs2(__vmaxs2(a, b), 0U);
#else
    // Host fallback: process the two 16-bit lanes independently.
    unsigned short la = (unsigned short)(a & 0xFFFFU), ha = (unsigned short)(a >> 16);
    unsigned short lb = (unsigned short)(b & 0xFFFFU), hb = (unsigned short)(b >> 16);

    // Reinterpret the raw lane bits as signed 16-bit values.
    short sla = *(short*)&la, sha = *(short*)&ha;
    short slb = *(short*)&lb, shb = *(short*)&hb;

    // Lane-wise max ...
    short lo = (short)max(sla, slb);
    short hi = (short)max(sha, shb);

    // ... then clamp each lane below at zero (relu).
    if (lo < 0) { lo = 0; }
    if (hi < 0) { hi = 0; }

    // Repack the two lanes into one 32-bit word.
    res = ((unsigned int)(*(unsigned short*)&lo)) |
          (((unsigned int)(*(unsigned short*)&hi)) << 16);
#endif

    return res;
}
|
| 161 |
+
|
| 162 |
+
/*
 * Signed 32-bit min(a, b) clamped below at zero ("min + relu").
 * On SM90+ devices this maps to the fused min.s32.relu instruction;
 * on the host and older devices it is emulated with min and a clamp.
 */
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimin_s32_relu(const int a, const int b){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm("{min.s32.relu %0, %1, %2;}" : "=r"(res) : "r"(a), "r"(b));
    return res;
#else
    // Fallback: take the smaller input, then clamp negatives to zero.
    const int smaller = min(a, b);
    return (smaller > 0) ? smaller : 0;
#endif
}
|
| 174 |
+
|
| 175 |
+
/*
 * Per-lane signed min of two packed s16x2 words, then relu per lane.
 * SM90+ uses the fused min.s16x2.relu instruction; other device
 * architectures emulate it with the SIMD video intrinsics; the host
 * path unpacks the lanes, computes, and repacks them.
 */
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin_s16x2_relu(const unsigned int a, const unsigned int b){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm("{min.s16x2.relu %0, %1, %2;}" : "=r"(res) : "r"(a), "r"(b));
#elif defined(__CUDA_ARCH__)
    // SIMD lane-wise min, then lane-wise max against 0 == relu.
    res = __vmaxs2(__vmins2(a, b), 0U);
#else
    // Host fallback: process the two 16-bit lanes independently.
    unsigned short la = (unsigned short)(a & 0xFFFFU), ha = (unsigned short)(a >> 16);
    unsigned short lb = (unsigned short)(b & 0xFFFFU), hb = (unsigned short)(b >> 16);

    // Reinterpret the raw lane bits as signed 16-bit values.
    short sla = *(short*)&la, sha = *(short*)&ha;
    short slb = *(short*)&lb, shb = *(short*)&hb;

    // Lane-wise min ...
    short lo = (short)min(sla, slb);
    short hi = (short)min(sha, shb);

    // ... then clamp each lane below at zero (relu).
    if (lo < 0) { lo = 0; }
    if (hi < 0) { hi = 0; }

    // Repack the two lanes into one 32-bit word.
    res = ((unsigned int)(*(unsigned short*)&lo)) |
          (((unsigned int)(*(unsigned short*)&hi)) << 16);
#endif

    return res;
}
|
| 215 |
+
|
| 216 |
+
/*
 * Three-way signed 32-bit maximum.  SM90+ devices issue two chained
 * max.s32 instructions via inline PTX; the host and older devices
 * use nested max().
 */
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax3_s32(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm ("{.reg .s32 t1; \n\t"
         "max.s32 t1, %1, %2; \n\t"
         "max.s32 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Fallback: two nested two-way maxima.
    return max(max(a, b), c);
#endif
}
|
| 229 |
+
|
| 230 |
+
/*
 * Three-way per-lane signed max on packed s16x2 words.  SM90+ uses
 * chained max.s16x2 PTX instructions; SM70+ devices chain the SIMD
 * video intrinsic; the host unpacks, computes, and repacks the lanes.
 */
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    // Inline PTX (instruction naming/syntax may change in the future):
    asm ("{.reg .b32 t1; \n\t"
         "max.s16x2 t1, %1, %2; \n\t"
         "max.s16x2 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_AND_AT_LEAST_SM_70__)
    res = __vmaxs2(__vmaxs2(a, b), c);
#else
    // Host fallback: process the two 16-bit lanes independently.
    unsigned short la = (unsigned short)(a & 0xFFFFU), ha = (unsigned short)(a >> 16);
    unsigned short lb = (unsigned short)(b & 0xFFFFU), hb = (unsigned short)(b >> 16);
    unsigned short lc = (unsigned short)(c & 0xFFFFU), hc = (unsigned short)(c >> 16);

    // Reinterpret the raw lane bits as signed 16-bit values.
    short sla = *(short*)&la, sha = *(short*)&ha;
    short slb = *(short*)&lb, shb = *(short*)&hb;
    short slc = *(short*)&lc, shc = *(short*)&hc;

    // Lane-wise three-way maximum.
    short lo = (short)max(max(sla, slb), slc);
    short hi = (short)max(max(sha, shb), shc);

    // Repack the two lanes into one 32-bit word.
    res = ((unsigned int)(*(unsigned short*)&lo)) |
          (((unsigned int)(*(unsigned short*)&hi)) << 16);
#endif
    return res;
}
|
| 275 |
+
|
| 276 |
+
/*
 * Three-way unsigned 32-bit maximum.  SM90+ devices issue two chained
 * max.u32 instructions via inline PTX; the host and older devices
 * use nested max().
 */
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_u32(const unsigned int a, const unsigned int b, const unsigned int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    // Fix: result of max.u32 is unsigned; declaring it as signed int
    // (as before) relied on an implicit conversion at the return and
    // was inconsistent with the other unsigned variants in this file.
    unsigned int res;
    asm ("{.reg .u32 t1; \n\t"
         "max.u32 t1, %1, %2; \n\t"
         "max.u32 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and older architecture code
    return max(max(a, b), c);
#endif
}
|
| 289 |
+
|
| 290 |
+
/*
 * Three-way per-lane unsigned max on packed u16x2 words.  SM90+ uses
 * chained max.u16x2 PTX instructions; other device architectures
 * chain the SIMD video intrinsic; the host unpacks, computes, and
 * repacks the lanes.
 */
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "max.u16x2 t1, %1, %2; \n\t"
         "max.u16x2 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
    res = __vmaxu2(__vmaxu2(a, b), c);
#else
    // Host fallback: process the two 16-bit lanes independently.
    unsigned short la = (unsigned short)(a & 0xFFFFU), ha = (unsigned short)(a >> 16);
    unsigned short lb = (unsigned short)(b & 0xFFFFU), hb = (unsigned short)(b >> 16);
    unsigned short lc = (unsigned short)(c & 0xFFFFU), hc = (unsigned short)(c >> 16);

    // Lane-wise three-way maximum (lanes are unsigned; no punning).
    unsigned short lo = (unsigned short)max(max(la, lb), lc);
    unsigned short hi = (unsigned short)max(max(ha, hb), hc);

    // Repack the two lanes into one 32-bit word.
    res = ((unsigned int)lo) | (((unsigned int)hi) << 16);
#endif

    return res;
}
|
| 321 |
+
|
| 322 |
+
/*
 * Three-way signed 32-bit minimum.  SM90+ devices issue two chained
 * min.s32 instructions via inline PTX; the host and older devices
 * use nested min().
 */
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimin3_s32(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm ("{.reg .s32 t1; \n\t"
         "min.s32 t1, %1, %2; \n\t"
         "min.s32 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Fallback: two nested two-way minima.
    return min(min(a, b), c);
#endif
}
|
| 335 |
+
|
| 336 |
+
/*
 * Three-way per-lane signed min on packed s16x2 words.  SM90+ uses
 * chained min.s16x2 PTX instructions; SM70+ devices chain the SIMD
 * video intrinsic; the host unpacks, computes, and repacks the lanes.
 */
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "min.s16x2 t1, %1, %2; \n\t"
         "min.s16x2 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_AND_AT_LEAST_SM_70__)
    res = __vmins2(__vmins2(a, b), c);
#else
    // Host fallback: process the two 16-bit lanes independently.
    unsigned short la = (unsigned short)(a & 0xFFFFU), ha = (unsigned short)(a >> 16);
    unsigned short lb = (unsigned short)(b & 0xFFFFU), hb = (unsigned short)(b >> 16);
    unsigned short lc = (unsigned short)(c & 0xFFFFU), hc = (unsigned short)(c >> 16);

    // Reinterpret the raw lane bits as signed 16-bit values.
    short sla = *(short*)&la, sha = *(short*)&ha;
    short slb = *(short*)&lb, shb = *(short*)&hb;
    short slc = *(short*)&lc, shc = *(short*)&hc;

    // Lane-wise three-way minimum.
    short lo = (short)min(min(sla, slb), slc);
    short hi = (short)min(min(sha, shb), shc);

    // Repack the two lanes into one 32-bit word.
    res = ((unsigned int)(*(unsigned short*)&lo)) |
          (((unsigned int)(*(unsigned short*)&hi)) << 16);
#endif

    return res;
}
|
| 381 |
+
|
| 382 |
+
/*
 * Three-way unsigned 32-bit minimum.  SM90+ devices issue two chained
 * min.u32 instructions via inline PTX; the host and older devices
 * use nested min().
 */
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_u32(const unsigned int a, const unsigned int b, const unsigned int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    // Fix: result of min.u32 is unsigned; declaring it as signed int
    // (as before) relied on an implicit conversion at the return and
    // was inconsistent with the other unsigned variants in this file.
    unsigned int res;
    asm ("{.reg .u32 t1; \n\t"
         "min.u32 t1, %1, %2; \n\t"
         "min.u32 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and older architecture code
    return min(min(a, b), c);
#endif
}
|
| 395 |
+
|
| 396 |
+
/*
 * Three-way per-lane unsigned min on packed u16x2 words.  SM90+ uses
 * chained min.u16x2 PTX instructions; other device architectures
 * chain the SIMD video intrinsic; the host unpacks, computes, and
 * repacks the lanes.
 */
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "min.u16x2 t1, %1, %2; \n\t"
         "min.u16x2 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
    res = __vminu2(__vminu2(a, b), c);
#else
    // Host fallback: process the two 16-bit lanes independently.
    unsigned short la = (unsigned short)(a & 0xFFFFU), ha = (unsigned short)(a >> 16);
    unsigned short lb = (unsigned short)(b & 0xFFFFU), hb = (unsigned short)(b >> 16);
    unsigned short lc = (unsigned short)(c & 0xFFFFU), hc = (unsigned short)(c >> 16);

    // Lane-wise three-way minimum (lanes are unsigned; no punning).
    unsigned short lo = (unsigned short)min(min(la, lb), lc);
    unsigned short hi = (unsigned short)min(min(ha, hb), hc);

    // Repack the two lanes into one 32-bit word.
    res = ((unsigned int)lo) | (((unsigned int)hi) << 16);
#endif

    return res;
}
|
| 427 |
+
|
| 428 |
+
/*
 * Three-way signed 32-bit maximum clamped below at zero.  SM90+
 * chains two max.s32.relu instructions; the host and older devices
 * use nested max() followed by a clamp.
 */
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax3_s32_relu(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm ("{.reg .s32 t1; \n\t"
         "max.s32.relu t1, %1, %2; \n\t"
         "max.s32.relu %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Fallback: three-way max, then clamp negatives to zero.
    const int largest = max(max(a, b), c);
    return (largest > 0) ? largest : 0;
#endif
}
|
| 443 |
+
|
| 444 |
+
/*
 * Three-way per-lane signed max on packed s16x2 words, then relu per
 * lane.  SM90+ chains two max.s16x2.relu instructions; SM75+ devices
 * combine the SIMD video intrinsic with the fused two-input helper;
 * the host unpacks, computes, clamps, and repacks the lanes.
 */
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "max.s16x2.relu t1, %1, %2; \n\t"
         "max.s16x2.relu %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_AND_AT_LEAST_SM_75__)
    res = __vimax_s16x2_relu(__vmaxs2(a, b), c);
#else
    // Host fallback: process the two 16-bit lanes independently.
    unsigned short la = (unsigned short)(a & 0xFFFFU), ha = (unsigned short)(a >> 16);
    unsigned short lb = (unsigned short)(b & 0xFFFFU), hb = (unsigned short)(b >> 16);
    unsigned short lc = (unsigned short)(c & 0xFFFFU), hc = (unsigned short)(c >> 16);

    // Reinterpret the raw lane bits as signed 16-bit values.
    short sla = *(short*)&la, sha = *(short*)&ha;
    short slb = *(short*)&lb, shb = *(short*)&hb;
    short slc = *(short*)&lc, shc = *(short*)&hc;

    // Lane-wise three-way maximum ...
    short lo = (short)max(max(sla, slb), slc);
    short hi = (short)max(max(sha, shb), shc);

    // ... then clamp each lane below at zero (relu).
    if (lo < 0) { lo = 0; }
    if (hi < 0) { hi = 0; }

    // Repack the two lanes into one 32-bit word.
    res = ((unsigned int)(*(unsigned short*)&lo)) |
          (((unsigned int)(*(unsigned short*)&hi)) << 16);
#endif

    return res;
}
|
| 493 |
+
|
| 494 |
+
/*
 * Three-way signed 32-bit minimum clamped below at zero.  SM90+
 * chains two min.s32.relu instructions; the host and older devices
 * use nested min() followed by a clamp.
 */
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimin3_s32_relu(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm ("{.reg .s32 t1; \n\t"
         "min.s32.relu t1, %1, %2; \n\t"
         "min.s32.relu %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Fallback: three-way min, then clamp negatives to zero.
    const int smallest = min(min(a, b), c);
    return (smallest > 0) ? smallest : 0;
#endif
}
|
| 509 |
+
|
| 510 |
+
/*
 * Three-way per-lane signed min on packed s16x2 words, then relu per
 * lane.  SM90+ chains two min.s16x2.relu instructions; SM75+ devices
 * combine the SIMD video intrinsic with the fused two-input helper;
 * the host unpacks, computes, clamps, and repacks the lanes.
 */
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "min.s16x2.relu t1, %1, %2; \n\t"
         "min.s16x2.relu %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_AND_AT_LEAST_SM_75__)
    res = __vimin_s16x2_relu(__vmins2(a, b), c);
#else
    // Host fallback: process the two 16-bit lanes independently.
    unsigned short la = (unsigned short)(a & 0xFFFFU), ha = (unsigned short)(a >> 16);
    unsigned short lb = (unsigned short)(b & 0xFFFFU), hb = (unsigned short)(b >> 16);
    unsigned short lc = (unsigned short)(c & 0xFFFFU), hc = (unsigned short)(c >> 16);

    // Reinterpret the raw lane bits as signed 16-bit values.
    short sla = *(short*)&la, sha = *(short*)&ha;
    short slb = *(short*)&lb, shb = *(short*)&hb;
    short slc = *(short*)&lc, shc = *(short*)&hc;

    // Lane-wise three-way minimum ...
    short lo = (short)min(min(sla, slb), slc);
    short hi = (short)min(min(sha, shb), shc);

    // ... then clamp each lane below at zero (relu).
    if (lo < 0) { lo = 0; }
    if (hi < 0) { hi = 0; }

    // Repack the two lanes into one 32-bit word.
    res = ((unsigned int)(*(unsigned short*)&lo)) |
          (((unsigned int)(*(unsigned short*)&hi)) << 16);
#endif

    return res;
}
|
| 559 |
+
|
| 560 |
+
/*
 * Fused signed 32-bit add-then-max: max(a + b, c).  SM90+ issues an
 * add.s32 followed by max.s32 via inline PTX; the host and older
 * devices compute the same expression directly.  Note the addition
 * wraps like the PTX add.s32 on overflow.
 */
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmax_s32(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm ("{.reg .s32 t1; \n\t"
         "add.s32 t1, %1, %2; \n\t"
         "max.s32 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Fallback: add, then take the larger of the sum and c.
    return max(a + b, c);
#endif
}
|
| 573 |
+
|
| 574 |
+
// Per-lane max(a + b, c) on two packed signed 16-bit values.
// SM90+ uses add.s16x2/max.s16x2; older devices use the SIMD-video
// intrinsics; the host path emulates the two lanes independently.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "add.s16x2 t1, %1, %2; \n\t"
         "max.s16x2 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
    res = __vmaxs2(__vadd2(a, b), c);
#else
    // Host and older architecture code:
    // split each operand into its low and high 16-bit lane.
    unsigned short ua_lo = (unsigned short)(a & 0xFFFFU), ua_hi = (unsigned short)(a >> 16);
    unsigned short ub_lo = (unsigned short)(b & 0xFFFFU), ub_hi = (unsigned short)(b >> 16);
    unsigned short uc_lo = (unsigned short)(c & 0xFFFFU), uc_hi = (unsigned short)(c >> 16);

    // Reinterpret the lane bits as signed 16-bit values.
    short sa_lo = *(short*)&ua_lo, sa_hi = *(short*)&ua_hi;
    short sb_lo = *(short*)&ub_lo, sb_hi = *(short*)&ub_hi;
    short sc_lo = *(short*)&uc_lo, sc_hi = *(short*)&uc_hi;

    // Per-lane wrapping add, then per-lane signed max against c.
    short sum_lo = (short)(sa_lo + sb_lo);
    short sum_hi = (short)(sa_hi + sb_hi);
    short r_lo = (sum_lo > sc_lo) ? sum_lo : sc_lo;
    short r_hi = (sum_hi > sc_hi) ? sum_hi : sc_hi;

    // Reinterpret back as unsigned and repack into one 32-bit word.
    unsigned short ur_lo = *(unsigned short*)&r_lo;
    unsigned short ur_hi = *(unsigned short*)&r_hi;
    res = ((unsigned int)ur_lo) | (((unsigned int)ur_hi) << 16);
#endif

    return res;
}
|
| 619 |
+
|
| 620 |
+
// Computes max(a + b, c) for unsigned 32-bit values.
// The add wraps modulo 2^32 in both the PTX and the host path.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_u32(const unsigned int a, const unsigned int b, const unsigned int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    unsigned int res;
    asm ("{.reg .u32 t1; \n\t"
         "add.u32 t1, %1, %2; \n\t"
         "max.u32 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and older architecture code
    const unsigned int sum = a + b;
    return (sum > c) ? sum : c;
#endif
}
|
| 633 |
+
|
| 634 |
+
// Per-lane max(a + b, c) on two packed unsigned 16-bit values.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "add.u16x2 t1, %1, %2; \n\t"
         "max.u16x2 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
    res = __vmaxu2(__vadd2(a, b), c);
#else
    // Host and older architecture code:
    // split each operand into its low and high 16-bit lane.
    unsigned short ua_lo = (unsigned short)(a & 0xFFFFU), ua_hi = (unsigned short)(a >> 16);
    unsigned short ub_lo = (unsigned short)(b & 0xFFFFU), ub_hi = (unsigned short)(b >> 16);
    unsigned short uc_lo = (unsigned short)(c & 0xFFFFU), uc_hi = (unsigned short)(c >> 16);

    // Per-lane wrapping add, then per-lane unsigned max against c.
    unsigned short sum_lo = (unsigned short)(ua_lo + ub_lo);
    unsigned short sum_hi = (unsigned short)(ua_hi + ub_hi);
    unsigned short r_lo = (sum_lo > uc_lo) ? sum_lo : uc_lo;
    unsigned short r_hi = (sum_hi > uc_hi) ? sum_hi : uc_hi;

    // Repack the two lanes into one 32-bit word.
    res = ((unsigned int)r_lo) | (((unsigned int)r_hi) << 16);
#endif

    return res;
}
|
| 665 |
+
|
| 666 |
+
// Computes min(a + b, c) for signed 32-bit integers.
// Device path (SM90+): add.s32 then min.s32.
// Host / pre-SM90 path: plain C++ fallback; the add is performed in
// unsigned arithmetic to avoid signed-overflow UB while matching the
// device's two's-complement wrap-around.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmin_s32(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm ("{.reg .s32 t1; \n\t"
         "add.s32 t1, %1, %2; \n\t"
         "min.s32 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and older architecture code
    // Wrapping add done in unsigned to avoid signed-overflow UB.
    const int sum = (int)((unsigned int)a + (unsigned int)b);
    return min(sum, c);
#endif
}
|
| 679 |
+
|
| 680 |
+
// Per-lane min(a + b, c) on two packed signed 16-bit values.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "add.s16x2 t1, %1, %2; \n\t"
         "min.s16x2 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
    res = __vmins2(__vadd2(a, b), c);
#else
    // Host and older architecture code:
    // split each operand into its low and high 16-bit lane.
    unsigned short ua_lo = (unsigned short)(a & 0xFFFFU), ua_hi = (unsigned short)(a >> 16);
    unsigned short ub_lo = (unsigned short)(b & 0xFFFFU), ub_hi = (unsigned short)(b >> 16);
    unsigned short uc_lo = (unsigned short)(c & 0xFFFFU), uc_hi = (unsigned short)(c >> 16);

    // Reinterpret the lane bits as signed 16-bit values.
    short sa_lo = *(short*)&ua_lo, sa_hi = *(short*)&ua_hi;
    short sb_lo = *(short*)&ub_lo, sb_hi = *(short*)&ub_hi;
    short sc_lo = *(short*)&uc_lo, sc_hi = *(short*)&uc_hi;

    // Per-lane wrapping add, then per-lane signed min against c.
    short sum_lo = (short)(sa_lo + sb_lo);
    short sum_hi = (short)(sa_hi + sb_hi);
    short r_lo = (sum_lo < sc_lo) ? sum_lo : sc_lo;
    short r_hi = (sum_hi < sc_hi) ? sum_hi : sc_hi;

    // Reinterpret back as unsigned and repack into one 32-bit word.
    unsigned short ur_lo = *(unsigned short*)&r_lo;
    unsigned short ur_hi = *(unsigned short*)&r_hi;
    res = ((unsigned int)ur_lo) | (((unsigned int)ur_hi) << 16);
#endif

    return res;
}
|
| 725 |
+
|
| 726 |
+
// Computes min(a + b, c) for unsigned 32-bit values.
// The add wraps modulo 2^32 in both the PTX and the host path.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_u32(const unsigned int a, const unsigned int b, const unsigned int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    unsigned int res;
    asm ("{.reg .u32 t1; \n\t"
         "add.u32 t1, %1, %2; \n\t"
         "min.u32 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and older architecture code
    const unsigned int sum = a + b;
    return (sum < c) ? sum : c;
#endif
}
|
| 739 |
+
|
| 740 |
+
// Per-lane min(a + b, c) on two packed unsigned 16-bit values.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "add.u16x2 t1, %1, %2; \n\t"
         "min.u16x2 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
    res = __vminu2(__vadd2(a, b), c);
#else
    // Host and older architecture code:
    // split each operand into its low and high 16-bit lane.
    unsigned short ua_lo = (unsigned short)(a & 0xFFFFU), ua_hi = (unsigned short)(a >> 16);
    unsigned short ub_lo = (unsigned short)(b & 0xFFFFU), ub_hi = (unsigned short)(b >> 16);
    unsigned short uc_lo = (unsigned short)(c & 0xFFFFU), uc_hi = (unsigned short)(c >> 16);

    // Per-lane wrapping add, then per-lane unsigned min against c.
    unsigned short sum_lo = (unsigned short)(ua_lo + ub_lo);
    unsigned short sum_hi = (unsigned short)(ua_hi + ub_hi);
    unsigned short r_lo = (sum_lo < uc_lo) ? sum_lo : uc_lo;
    unsigned short r_hi = (sum_hi < uc_hi) ? sum_hi : uc_hi;

    // Repack the two lanes into one 32-bit word.
    res = ((unsigned int)r_lo) | (((unsigned int)r_hi) << 16);
#endif

    return res;
}
|
| 771 |
+
|
| 772 |
+
// Computes max(max(a + b, c), 0) for signed 32-bit integers
// (add, max, then a ReLU clamp at zero).
// Device path (SM90+): add.s32 then max.s32.relu.
// Host / pre-SM90 path: the add is performed in unsigned arithmetic to
// avoid signed-overflow UB while matching the device's wrap-around.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmax_s32_relu(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm ("{.reg .s32 t1; \n\t"
         "add.s32 t1, %1, %2; \n\t"
         "max.s32.relu %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and older architecture code
    // Wrapping add done in unsigned to avoid signed-overflow UB.
    const int sum = (int)((unsigned int)a + (unsigned int)b);
    const int ans = max(sum, c);

    return (ans > 0) ? ans : 0;
#endif
}
|
| 787 |
+
|
| 788 |
+
// Per-lane max(a + b, c) with a ReLU clamp at zero, on two packed
// signed 16-bit values.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "add.s16x2 t1, %1, %2; \n\t"
         "max.s16x2.relu %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
    res = __vimax_s16x2_relu(__vadd2(a, b), c);
#else
    // Host and older architecture code:
    // split each operand into its low and high 16-bit lane.
    unsigned short ua_lo = (unsigned short)(a & 0xFFFFU), ua_hi = (unsigned short)(a >> 16);
    unsigned short ub_lo = (unsigned short)(b & 0xFFFFU), ub_hi = (unsigned short)(b >> 16);
    unsigned short uc_lo = (unsigned short)(c & 0xFFFFU), uc_hi = (unsigned short)(c >> 16);

    // Reinterpret the lane bits as signed 16-bit values.
    short sa_lo = *(short*)&ua_lo, sa_hi = *(short*)&ua_hi;
    short sb_lo = *(short*)&ub_lo, sb_hi = *(short*)&ub_hi;
    short sc_lo = *(short*)&uc_lo, sc_hi = *(short*)&uc_hi;

    // Per-lane wrapping add, then per-lane signed max against c.
    short sum_lo = (short)(sa_lo + sb_lo);
    short sum_hi = (short)(sa_hi + sb_hi);
    short r_lo = (sum_lo > sc_lo) ? sum_lo : sc_lo;
    short r_hi = (sum_hi > sc_hi) ? sum_hi : sc_hi;

    // ReLU: clamp negative lane results to zero.
    if (r_lo < 0) { r_lo = 0; }
    if (r_hi < 0) { r_hi = 0; }

    // Reinterpret back as unsigned and repack into one 32-bit word.
    unsigned short ur_lo = *(unsigned short*)&r_lo;
    unsigned short ur_hi = *(unsigned short*)&r_hi;
    res = ((unsigned int)ur_lo) | (((unsigned int)ur_hi) << 16);
#endif

    return res;
}
|
| 836 |
+
|
| 837 |
+
// Computes max(min(a + b, c), 0) for signed 32-bit integers
// (add, min, then a ReLU clamp at zero).
// Device path (SM90+): add.s32 then min.s32.relu.
// Host / pre-SM90 path: the add is performed in unsigned arithmetic to
// avoid signed-overflow UB while matching the device's wrap-around.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmin_s32_relu(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm ("{.reg .s32 t1; \n\t"
         "add.s32 t1, %1, %2; \n\t"
         "min.s32.relu %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and older architecture code
    // Wrapping add done in unsigned to avoid signed-overflow UB.
    const int sum = (int)((unsigned int)a + (unsigned int)b);
    const int ans = min(sum, c);

    return (ans > 0) ? ans : 0;
#endif
}
|
| 852 |
+
|
| 853 |
+
// Per-lane min(a + b, c) with a ReLU clamp at zero, on two packed
// signed 16-bit values.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "add.s16x2 t1, %1, %2; \n\t"
         "min.s16x2.relu %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
    res = __vimin_s16x2_relu(__vadd2(a, b), c);
#else
    // Host and older architecture code:
    // split each operand into its low and high 16-bit lane.
    unsigned short ua_lo = (unsigned short)(a & 0xFFFFU), ua_hi = (unsigned short)(a >> 16);
    unsigned short ub_lo = (unsigned short)(b & 0xFFFFU), ub_hi = (unsigned short)(b >> 16);
    unsigned short uc_lo = (unsigned short)(c & 0xFFFFU), uc_hi = (unsigned short)(c >> 16);

    // Reinterpret the lane bits as signed 16-bit values.
    short sa_lo = *(short*)&ua_lo, sa_hi = *(short*)&ua_hi;
    short sb_lo = *(short*)&ub_lo, sb_hi = *(short*)&ub_hi;
    short sc_lo = *(short*)&uc_lo, sc_hi = *(short*)&uc_hi;

    // Per-lane wrapping add, then per-lane signed min against c.
    short sum_lo = (short)(sa_lo + sb_lo);
    short sum_hi = (short)(sa_hi + sb_hi);
    short r_lo = (sum_lo < sc_lo) ? sum_lo : sc_lo;
    short r_hi = (sum_hi < sc_hi) ? sum_hi : sc_hi;

    // ReLU: clamp negative lane results to zero.
    if (r_lo < 0) { r_lo = 0; }
    if (r_hi < 0) { r_hi = 0; }

    // Reinterpret back as unsigned and repack into one 32-bit word.
    unsigned short ur_lo = *(unsigned short*)&r_lo;
    unsigned short ur_hi = *(unsigned short*)&r_hi;
    res = ((unsigned int)ur_lo) | (((unsigned int)ur_hi) << 16);
#endif

    return res;
}
|
| 901 |
+
|
| 902 |
+
// vimax vimin with predicate
|
| 903 |
+
// *pred gets set to '(a >= b)'
|
| 904 |
+
// Signed 32-bit maximum with predicate: returns max(a, b) and sets
// *pred to (a >= b), i.e. whether 'a' was selected.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vibmax_s32(const int a, const int b, bool* const pred){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int val;
    unsigned int predicate_local;
    asm ("{ .reg .pred __$temp1;\n\t"
         " setp.ge.s32 __$temp1, %2, %3;\n\t"
         " selp.s32 %0, %2, %3, __$temp1;\n\t"
         " selp.s32 %1, 1, 0, __$temp1;}\n\t"
         : "=r"(val), "=r"(predicate_local) : "r"(a), "r"(b));

    *pred = (bool)predicate_local;
    return val;
#else
    // Host and older architecture code
    *pred = (a >= b);
    return (a >= b) ? a : b;
#endif
}
|
| 924 |
+
|
| 925 |
+
// Unsigned 32-bit maximum with predicate: returns max(a, b) and sets
// *pred to (a >= b), i.e. whether 'a' was selected.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmax_u32(const unsigned int a, const unsigned int b, bool* const pred){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    unsigned int val;
    unsigned int predicate_local;
    asm ("{ .reg .pred __$temp1;\n\t"
         " setp.ge.u32 __$temp1, %2, %3;\n\t"
         " selp.u32 %0, %2, %3, __$temp1;\n\t"
         " selp.u32 %1, 1, 0, __$temp1;}\n\t"
         : "=r"(val), "=r"(predicate_local) : "r"(a), "r"(b));

    *pred = (bool)predicate_local;
    return val;
#else
    // Host and older architecture code
    *pred = (a >= b);
    return (a >= b) ? a : b;
#endif
}
|
| 945 |
+
|
| 946 |
+
// *pred gets set to '(a <= b)'
|
| 947 |
+
// Signed 32-bit minimum with predicate: returns min(a, b) and sets
// *pred to (a <= b), i.e. whether 'a' was selected.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vibmin_s32(const int a, const int b, bool* const pred){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int val;
    unsigned int predicate_local;
    asm ("{ .reg .pred __$temp1;\n\t"
         " setp.le.s32 __$temp1, %2, %3;\n\t"
         " selp.s32 %0, %2, %3, __$temp1;\n\t"
         " selp.s32 %1, 1, 0, __$temp1;}\n\t"
         : "=r"(val), "=r"(predicate_local) : "r"(a), "r"(b));

    *pred = (bool)predicate_local;
    return val;
#else
    // Host and older architecture code
    *pred = (a <= b);
    return (a <= b) ? a : b;
#endif
}
|
| 967 |
+
|
| 968 |
+
// *pred gets set to '(a <= b)'
|
| 969 |
+
// Unsigned 32-bit minimum with predicate: returns min(a, b) and sets
// *pred to (a <= b), i.e. whether 'a' was selected.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmin_u32(const unsigned int a, const unsigned int b, bool* const pred){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    unsigned int val;
    unsigned int predicate_local;
    asm ("{ .reg .pred __$temp1;\n\t"
         " setp.le.u32 __$temp1, %2, %3;\n\t"
         " selp.u32 %0, %2, %3, __$temp1;\n\t"
         " selp.u32 %1, 1, 0, __$temp1;}\n\t"
         : "=r"(val), "=r"(predicate_local) : "r"(a), "r"(b));

    *pred = (bool)predicate_local;
    return val;
#else
    // Host and older architecture code
    *pred = (a <= b);
    return (a <= b) ? a : b;
#endif
}
|
| 989 |
+
|
| 990 |
+
// Per-lane signed 16-bit maximum with per-lane predicates.
// Each result lane is max(a_lane, b_lane); *pred_hi / *pred_lo report
// whether the corresponding lane of 'a' was selected (a_lane >= b_lane).
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmax_s16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    unsigned int val;
    unsigned int predicate_local_hi;
    unsigned int predicate_local_lo;
    // max.s16x2 computes both lanes at once; each predicate is recovered
    // by comparing the result lane for equality with the matching lane of
    // 'a' (result == a  <=>  a >= b for that lane).
    asm ("{.reg .pred pu, pv; \n\t"
         ".reg .s16 rs0, rs1, rs2, rs3; \n\t"
         "max.s16x2 %0, %3, %4; \n\t"
         "mov.b32 {rs0, rs1}, %0; \n\t"      // rs0 = result low lane, rs1 = result high lane
         "mov.b32 {rs2, rs3}, %3; \n\t"      // rs2 = a low lane,     rs3 = a high lane
         "setp.eq.s16 pv, rs0, rs2; \n\t"    // pv: low-lane predicate
         "setp.eq.s16 pu, rs1, rs3; \n\t"    // pu: high-lane predicate
         "selp.b32 %1, 1, 0, pu; \n\t"
         "selp.b32 %2, 1, 0, pv;} \n\t"
         : "=r"(val), "=r"(predicate_local_hi),"=r"(predicate_local_lo) : "r"(a), "r"(b));

    *pred_hi = (bool)predicate_local_hi;
    *pred_lo = (bool)predicate_local_lo;
    return val;
#else
    // Host and older architecture code
    // Separate our high and low bit:
    unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
    unsigned short aU_hi = (unsigned short)(a >> 16);

    unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
    unsigned short bU_hi = (unsigned short)(b >> 16);

    //cast to signed: (bit-level reinterpretation via pointer punning)
    short aS_lo = *(short*)& aU_lo;
    short aS_hi = *(short*)& aU_hi;

    short bS_lo = *(short*)& bU_lo;
    short bS_hi = *(short*)& bU_hi;

    // Get answer
    short ansS_lo = (short)max(aS_lo, bS_lo);
    short ansS_hi = (short)max(aS_hi, bS_hi);

    // Predicates report whether the lane of 'a' was taken.
    *pred_hi = (aS_hi >= bS_hi);
    *pred_lo = (aS_lo >= bS_lo);

    // Cast back to unsigned:
    unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
    unsigned short ansU_hi = *(unsigned short*)& ansS_hi;

    // Put answer back together:
    unsigned int ans = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);

    return ans;
#endif
}
|
| 1042 |
+
|
| 1043 |
+
// Per-lane unsigned 16-bit maximum with per-lane predicates.
// Each result lane is max(a_lane, b_lane); *pred_hi / *pred_lo report
// whether the corresponding lane of 'a' was selected (a_lane >= b_lane).
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmax_u16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    unsigned int val;
    unsigned int predicate_local_hi;
    unsigned int predicate_local_lo;
    // max.u16x2 computes both lanes at once; each predicate is recovered
    // by comparing the result lane for equality with the matching lane of
    // 'a' (result == a  <=>  a >= b for that lane).
    asm ("{.reg .pred pu, pv; \n\t"
         ".reg .u16 rs0, rs1, rs2, rs3; \n\t"
         "max.u16x2 %0, %3, %4; \n\t"
         "mov.b32 {rs0, rs1}, %0; \n\t"      // rs0 = result low lane, rs1 = result high lane
         "mov.b32 {rs2, rs3}, %3; \n\t"      // rs2 = a low lane,     rs3 = a high lane
         "setp.eq.u16 pv, rs0, rs2; \n\t"    // pv: low-lane predicate
         "setp.eq.u16 pu, rs1, rs3; \n\t"    // pu: high-lane predicate
         "selp.b32 %1, 1, 0, pu; \n\t"
         "selp.b32 %2, 1, 0, pv;} \n\t"
         : "=r"(val), "=r"(predicate_local_hi),"=r"(predicate_local_lo) : "r"(a), "r"(b));

    *pred_hi = (bool)predicate_local_hi;
    *pred_lo = (bool)predicate_local_lo;
    return val;
#else
    // Host and older architecture code
    // Separate our high and low bit:
    unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
    unsigned short aU_hi = (unsigned short)(a >> 16);

    unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
    unsigned short bU_hi = (unsigned short)(b >> 16);

    // Get answer
    unsigned short ansU_lo = (unsigned short)max(aU_lo, bU_lo);
    unsigned short ansU_hi = (unsigned short)max(aU_hi, bU_hi);

    // Predicates report whether the lane of 'a' was taken.
    *pred_hi = (aU_hi >= bU_hi);
    *pred_lo = (aU_lo >= bU_lo);

    // Put answer back together:
    unsigned int ans = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);

    return ans;
#endif
}
|
| 1084 |
+
|
| 1085 |
+
// Per-lane signed 16-bit minimum with per-lane predicates.
// Each result lane is min(a_lane, b_lane); *pred_hi / *pred_lo report
// whether the corresponding lane of 'a' was selected (a_lane <= b_lane).
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmin_s16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    unsigned int val;
    unsigned int predicate_local_hi;
    unsigned int predicate_local_lo;
    // min.s16x2 computes both lanes at once; each predicate is recovered
    // by comparing the result lane for equality with the matching lane of
    // 'a' (result == a  <=>  a <= b for that lane).
    // NOTE(review): the temporaries are declared .u16 while the compares use
    // setp.eq.s16 — the equality test is bit-exact either way.
    asm ("{.reg .pred pu, pv; \n\t"
         ".reg .u16 rs0, rs1, rs2, rs3; \n\t"
         "min.s16x2 %0, %3, %4; \n\t"
         "mov.b32 {rs0, rs1}, %0; \n\t"      // rs0 = result low lane, rs1 = result high lane
         "mov.b32 {rs2, rs3}, %3; \n\t"      // rs2 = a low lane,     rs3 = a high lane
         "setp.eq.s16 pv, rs0, rs2; \n\t"    // pv: low-lane predicate
         "setp.eq.s16 pu, rs1, rs3; \n\t"    // pu: high-lane predicate
         "selp.b32 %1, 1, 0, pu; \n\t"
         "selp.b32 %2, 1, 0, pv;} \n\t"
         : "=r"(val), "=r"(predicate_local_hi),"=r"(predicate_local_lo) : "r"(a), "r"(b));

    *pred_hi = (bool)predicate_local_hi;
    *pred_lo = (bool)predicate_local_lo;
    return val;
#else
    // Host and older architecture code
    // Separate our high and low bit:
    unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
    unsigned short aU_hi = (unsigned short)(a >> 16);

    unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
    unsigned short bU_hi = (unsigned short)(b >> 16);

    //cast to signed: (bit-level reinterpretation via pointer punning)
    short aS_lo = *(short*)& aU_lo;
    short aS_hi = *(short*)& aU_hi;

    short bS_lo = *(short*)& bU_lo;
    short bS_hi = *(short*)& bU_hi;

    // Get answer
    short ansS_lo = (short)min(aS_lo, bS_lo);
    short ansS_hi = (short)min(aS_hi, bS_hi);

    // Predicates report whether the lane of 'a' was taken.
    *pred_hi = (aS_hi <= bS_hi);
    *pred_lo = (aS_lo <= bS_lo);

    // Cast back to unsigned:
    unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
    unsigned short ansU_hi = *(unsigned short*)& ansS_hi;

    // Put answer back together:
    unsigned int ans = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);

    return ans;
#endif
}
|
| 1137 |
+
|
| 1138 |
+
// Per-lane unsigned 16-bit minimum with per-lane predicates.
// Each result lane is min(a_lane, b_lane); *pred_hi / *pred_lo report
// whether the corresponding lane of 'a' was selected (a_lane <= b_lane).
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmin_u16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    unsigned int val;
    unsigned int predicate_local_hi;
    unsigned int predicate_local_lo;
    // min.u16x2 computes both lanes at once; each predicate is recovered
    // by comparing the result lane for equality with the matching lane of
    // 'a' (result == a  <=>  a <= b for that lane).
    asm ("{.reg .pred pu, pv; \n\t"
         ".reg .u16 rs0, rs1, rs2, rs3; \n\t"
         "min.u16x2 %0, %3, %4; \n\t"
         "mov.b32 {rs0, rs1}, %0; \n\t"      // rs0 = result low lane, rs1 = result high lane
         "mov.b32 {rs2, rs3}, %3; \n\t"      // rs2 = a low lane,     rs3 = a high lane
         "setp.eq.u16 pv, rs0, rs2; \n\t"    // pv: low-lane predicate
         "setp.eq.u16 pu, rs1, rs3; \n\t"    // pu: high-lane predicate
         "selp.b32 %1, 1, 0, pu; \n\t"
         "selp.b32 %2, 1, 0, pv;} \n\t"
         : "=r"(val), "=r"(predicate_local_hi),"=r"(predicate_local_lo) : "r"(a), "r"(b));

    *pred_hi = (bool)predicate_local_hi;
    *pred_lo = (bool)predicate_local_lo;
    return val;
#else
    // Host and older architecture code
    // Separate our high and low bit:
    unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
    unsigned short aU_hi = (unsigned short)(a >> 16);

    unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
    unsigned short bU_hi = (unsigned short)(b >> 16);

    // Get answer
    unsigned short ansU_lo = (unsigned short)min(aU_lo, bU_lo);
    unsigned short ansU_hi = (unsigned short)min(aU_hi, bU_hi);

    // Predicates report whether the lane of 'a' was taken.
    *pred_hi = (aU_hi <= bU_hi);
    *pred_lo = (aU_lo <= bU_lo);

    // Put answer back together:
    unsigned int ans = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);

    return ans;
#endif
}
|
| 1179 |
+
|
| 1180 |
+
#ifdef __CUDA_AND_AT_LEAST_SM_90__
|
| 1181 |
+
#undef __CUDA_AND_AT_LEAST_SM_90__
|
| 1182 |
+
#endif
|
| 1183 |
+
|
| 1184 |
+
#undef __DEVICE_HOST_FUNCTIONS_STATIC_DECL__
|
| 1185 |
+
|
| 1186 |
+
/*******************************************************************************
|
| 1187 |
+
* *
|
| 1188 |
+
* *
|
| 1189 |
+
* *
|
| 1190 |
+
*******************************************************************************/
|
| 1191 |
+
|
| 1192 |
+
#endif /* !__DEVICE_FUNCTIONS_HPP__ */
|
| 1193 |
+
|
| 1194 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__)
|
| 1195 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 1196 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__
|
| 1197 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/func_macro.h
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* NVIDIA_COPYRIGHT_BEGIN
|
| 3 |
+
*
|
| 4 |
+
* Copyright (c) 2008-2018, NVIDIA CORPORATION. All rights reserved.
|
| 5 |
+
*
|
| 6 |
+
* NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 7 |
+
* and proprietary rights in and to this software, related documentation
|
| 8 |
+
* and any modifications thereto. Any use, reproduction, disclosure or
|
| 9 |
+
* distribution of this software and related documentation without an express
|
| 10 |
+
* license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 11 |
+
*
|
| 12 |
+
* NVIDIA_COPYRIGHT_END
|
| 13 |
+
*/
|
| 14 |
+
|
| 15 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 16 |
+
#if defined(_MSC_VER)
|
| 17 |
+
#pragma message("crt/func_macro.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 18 |
+
#else
|
| 19 |
+
#warning "crt/func_macro.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 20 |
+
#endif
|
| 21 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 22 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__
|
| 23 |
+
#endif
|
| 24 |
+
|
| 25 |
+
#if !defined(__FUNC_MACRO_H__)
|
| 26 |
+
#define __FUNC_MACRO_H__
|
| 27 |
+
|
| 28 |
+
#if !defined(__CUDA_INTERNAL_COMPILATION__)
|
| 29 |
+
|
| 30 |
+
#error -- incorrect inclusion of a cudart header file
|
| 31 |
+
|
| 32 |
+
#endif /* !__CUDA_INTERNAL_COMPILATION__ */
|
| 33 |
+
|
| 34 |
+
#if defined(__GNUC__)

/* GCC-compatible host compilers: host functions are emitted as plain
   inline definitions. */
#define __func__(decl) \
        inline decl

/* Device-emulation functions: static with __unused__ so unreferenced
   instances do not trigger -Wunused-function warnings. */
#define __device_func__(decl) \
        static __attribute__((__unused__)) decl

#elif defined(_WIN32)

/* MSVC: no __attribute__ support; use static inline / static instead. */
#define __func__(decl) \
        static inline decl

#define __device_func__(decl) \
        static decl

#endif /* __GNUC__ */
|
| 51 |
+
|
| 52 |
+
#endif /* __FUNC_MACRO_H__ */
|
| 53 |
+
|
| 54 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__)
|
| 55 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 56 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__
|
| 57 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_config.h
ADDED
|
@@ -0,0 +1,310 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2024 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("crt/host_config.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "crt/host_config.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#if !defined(__HOST_CONFIG_H__)
|
| 61 |
+
#define __HOST_CONFIG_H__
|
| 62 |
+
|
| 63 |
+
/*******************************************************************************
|
| 64 |
+
* *
|
| 65 |
+
* *
|
| 66 |
+
* *
|
| 67 |
+
*******************************************************************************/
|
| 68 |
+
|
| 69 |
+
#if defined(__CUDACC__)
|
| 70 |
+
|
| 71 |
+
#if defined(__CUDACC_RTC__)
|
| 72 |
+
|
| 73 |
+
#define _CRTIMP
|
| 74 |
+
#define __THROW
|
| 75 |
+
|
| 76 |
+
#else /* __CUDACC_RTC__ */
|
| 77 |
+
|
| 78 |
+
/* check for host compilers that are compatible with nvcc */
|
| 79 |
+
#if !defined(__GNUC__) && !defined(_WIN32)
|
| 80 |
+
|
| 81 |
+
#error --- !!! UNSUPPORTED COMPILER !!! ---
|
| 82 |
+
|
| 83 |
+
#endif /* !__GNUC__ && !_WIN32 */
|
| 84 |
+
|
| 85 |
+
/* check invalid configurations */
|
| 86 |
+
#if defined(__PGIC__)
|
| 87 |
+
#if !defined(__GNUC__) || !defined(__LP64__) || !defined(__linux__)
|
| 88 |
+
#error -- unsupported pgc++ configuration! pgc++ is supported only on Linux x86_64!
|
| 89 |
+
#endif /* !defined(__GNUC__) || !defined(__LP64__) || !defined(__linux__) */
|
| 90 |
+
#endif /* defined(__PGIC__) */
|
| 91 |
+
|
| 92 |
+
#if defined(__powerpc__)
|
| 93 |
+
#if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
|
| 94 |
+
#error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
|
| 95 |
+
#endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
|
| 96 |
+
#endif /* __powerpc__ */
|
| 97 |
+
|
| 98 |
+
#if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
|
| 99 |
+
#error -- clang and clang++ are the only supported host compilers on Mac OS X!
|
| 100 |
+
#endif /* __APPLE__ && __MACH__ && !__clang__ */
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
/* check host compiler version */
|
| 104 |
+
#if !__NV_NO_HOST_COMPILER_CHECK
|
| 105 |
+
|
| 106 |
+
#if defined(__ICC)
|
| 107 |
+
|
| 108 |
+
#if (__ICC != 1500 && __ICC != 1600 && __ICC != 1700 && __ICC != 1800 && !(__ICC >= 1900 && __ICC <= 2021)) || !defined(__GNUC__) || !defined(__LP64__)
|
| 109 |
+
|
| 110 |
+
#error -- unsupported ICC configuration! Only ICC 15.0, ICC 16.0, ICC 17.0, ICC 18.0, ICC 19.x and 20.x on Linux x86_64 are supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
|
| 111 |
+
|
| 112 |
+
#endif /* (__ICC != 1500 && __ICC != 1600 && __ICC != 1700 && __ICC != 1800 && __ICC != 1900) || !__GNUC__ || !__LP64__ */
|
| 113 |
+
|
| 114 |
+
#endif /* __ICC */
|
| 115 |
+
|
| 116 |
+
#if defined(__GRCO_CLANG_COMPILER__)
|
| 117 |
+
#if (__GRCO_CLANG_COMPILER__ == 1) && ((__clang_major__ < 16) || (__clang_major__ > 17))
|
| 118 |
+
#error -- unsupported Grace clang version! The version must be 16.x to 17.x. The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
|
| 119 |
+
#endif /* (__GRCO_CLANG_COMPILER__ == 1) && ((__clang_major__ < 16) || (__clang_major__ > 17)) */
|
| 120 |
+
|
| 121 |
+
#endif /* __GRCO_CLANG_COMPILER__ */
|
| 122 |
+
|
| 123 |
+
#if defined(__INTEL_CLANG_COMPILER)
|
| 124 |
+
#error -- unsupported Intel ICX compiler! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
|
| 125 |
+
#endif /* __INTEL_CLANG_COMPILER */
|
| 126 |
+
|
| 127 |
+
#if defined(__powerpc__)
|
| 128 |
+
|
| 129 |
+
#if defined(__ibmxl_vrm__) && !(__ibmxl_vrm__ >= 0x0d010000 && __ibmxl_vrm__ < 0x0d020000) && \
|
| 130 |
+
!(__ibmxl_vrm__ >= 0x10010000 && __ibmxl_vrm__ < 0x10020000)
|
| 131 |
+
|
| 132 |
+
#error -- unsupported xlC version! only xlC 13.1 and 16.1 are supported. The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
|
| 133 |
+
|
| 134 |
+
#endif /* __ibmxl_vrm__ && !(__ibmxl_vrm__ >= 0x0d010000 && __ibmxl_vrm__ < 0x0d020000) &&
|
| 135 |
+
!(__ibmxl_vrm__ >= 0x10010000 && __ibmxl_vrm__ < 0x10020000) */
|
| 136 |
+
|
| 137 |
+
#endif /* __powerpc__ */
|
| 138 |
+
|
| 139 |
+
#if defined(__GNUC__)
|
| 140 |
+
|
| 141 |
+
#if __GNUC__ > 13
|
| 142 |
+
|
| 143 |
+
#error -- unsupported GNU version! gcc versions later than 13 are not supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
|
| 144 |
+
|
| 145 |
+
#endif /* __GNUC__ > 13 */
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
#if defined(__HORIZON__)
|
| 149 |
+
#if (__clang_major__ >= 18) || (__clang_major__ < 3) || ((__clang_major__ == 3) && (__clang_minor__ < 3))
|
| 150 |
+
#error -- unsupported HOS clang version! The version must be must be less than 18 and greater than 3.2 . The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
|
| 151 |
+
#endif /* (__clang_major__ >= 18) || (__clang_major__ < 3) || ((__clang_major__ == 3) && (__clang_minor__ < 3)) */
|
| 152 |
+
#endif /* __HORIZON__ */
|
| 153 |
+
|
| 154 |
+
#if defined(__clang__) && !defined(__ibmxl_vrm__) && !defined(__ICC) && !defined(__HORIZON__) && !defined(__APPLE__) && !defined(__GRCO_CLANG_COMPILER__)
|
| 155 |
+
|
| 156 |
+
#if (__clang_major__ >= 18) || (__clang_major__ < 3) || ((__clang_major__ == 3) && (__clang_minor__ < 3))
|
| 157 |
+
#error -- unsupported clang version! clang version must be less than 18 and greater than 3.2 . The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
|
| 158 |
+
|
| 159 |
+
#endif /* (__clang_major__ >= 18) || (__clang_major__ < 3) || ((__clang_major__ == 3) && (__clang_minor__ < 3)) */
|
| 160 |
+
|
| 161 |
+
#endif /* defined(__clang__) && !defined(__ibmxl_vrm__) && !defined(__ICC) && !defined(__HORIZON__) && !defined(__APPLE__) && !defined(__GRCO_CLANG_COMPILER__) */
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
#endif /* __GNUC__ */
|
| 165 |
+
|
| 166 |
+
#if defined(_WIN32)
|
| 167 |
+
|
| 168 |
+
#if _MSC_VER < 1910 || _MSC_VER >= 1950
|
| 169 |
+
|
| 170 |
+
#error -- unsupported Microsoft Visual Studio version! Only the versions between 2017 and 2022 (inclusive) are supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
|
| 171 |
+
|
| 172 |
+
#elif _MSC_VER >= 1910 && _MSC_VER < 1910
|
| 173 |
+
|
| 174 |
+
#pragma message("support for this version of Microsoft Visual Studio has been deprecated! Only the versions between 2017 and 2022 (inclusive) are supported!")
|
| 175 |
+
|
| 176 |
+
#endif /* (_MSC_VER < 1910 || _MSC_VER >= 1950) || (_MSC_VER >= 1910 && _MSC_VER < 1910) */
|
| 177 |
+
|
| 178 |
+
#endif /* _WIN32 */
|
| 179 |
+
#endif /* !__NV_NO_HOST_COMPILER_CHECK */
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
/* configure host compiler */
|
| 183 |
+
#if defined(__APPLE__)
|
| 184 |
+
|
| 185 |
+
#define _CRTIMP
|
| 186 |
+
#define _ACRTIMP
|
| 187 |
+
#define __THROW
|
| 188 |
+
|
| 189 |
+
#if defined(__BLOCKS__) /* nvcc does not support closures */
|
| 190 |
+
|
| 191 |
+
#undef __BLOCKS__
|
| 192 |
+
|
| 193 |
+
#endif /* __BLOCKS__ */
|
| 194 |
+
|
| 195 |
+
#elif defined(__ANDROID__)
|
| 196 |
+
|
| 197 |
+
#define _CRTIMP
|
| 198 |
+
#define _ACRTIMP
|
| 199 |
+
#define __THROW
|
| 200 |
+
|
| 201 |
+
#elif defined(__QNX__)
|
| 202 |
+
|
| 203 |
+
#define _CRTIMP
|
| 204 |
+
#define _ACRTIMP
|
| 205 |
+
#define __THROW
|
| 206 |
+
|
| 207 |
+
#elif defined(__HORIZON__)
|
| 208 |
+
|
| 209 |
+
#define _CRTIMP
|
| 210 |
+
#define _ACRTIMP
|
| 211 |
+
#define __THROW
|
| 212 |
+
|
| 213 |
+
#elif defined(__GNUC__)
|
| 214 |
+
|
| 215 |
+
#define _CRTIMP
|
| 216 |
+
#define _ACRTIMP
|
| 217 |
+
|
| 218 |
+
#include <features.h> /* for __THROW */
|
| 219 |
+
|
| 220 |
+
#elif defined(_WIN32)
|
| 221 |
+
|
| 222 |
+
#if _MSC_VER >= 1500
|
| 223 |
+
|
| 224 |
+
#undef _USE_DECLSPECS_FOR_SAL
|
| 225 |
+
#define _USE_DECLSPECS_FOR_SAL \
|
| 226 |
+
1
|
| 227 |
+
|
| 228 |
+
#endif /* _MSC_VER >= 1500 */
|
| 229 |
+
|
| 230 |
+
#if !defined(_CRT_NONSTDC_NO_WARNINGS)
|
| 231 |
+
|
| 232 |
+
#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
|
| 233 |
+
|
| 234 |
+
#endif /* !_CRT_NONSTDC_NO_WARNINGS */
|
| 235 |
+
|
| 236 |
+
#if !defined(_CRT_SECURE_NO_WARNINGS)
|
| 237 |
+
|
| 238 |
+
#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
|
| 239 |
+
|
| 240 |
+
#endif /* !_CRT_SECURE_NO_WARNINGS */
|
| 241 |
+
|
| 242 |
+
#if !defined(NOMINMAX)
|
| 243 |
+
|
| 244 |
+
#define NOMINMAX /* min and max are part of cuda runtime */
|
| 245 |
+
|
| 246 |
+
#endif /* !NOMINMAX */
|
| 247 |
+
|
| 248 |
+
#include <crtdefs.h> /* for _CRTIMP */
|
| 249 |
+
#if _MSC_VER >= 1900
|
| 250 |
+
#include <corecrt.h> /* for _ACRTIMP */
|
| 251 |
+
#endif /* _MSC_VER >= 1900 */
|
| 252 |
+
|
| 253 |
+
#define __THROW
|
| 254 |
+
|
| 255 |
+
#endif /* __APPLE__ */
|
| 256 |
+
|
| 257 |
+
#endif /* __CUDACC_RTC__ */
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
#if defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER)))
|
| 261 |
+
|
| 262 |
+
#if __CUDACC_RTC__
|
| 263 |
+
typedef char *va_list;
|
| 264 |
+
#else /* !__CUDACC_RTC__ */
|
| 265 |
+
#include <cstdarg>
|
| 266 |
+
#endif /* __CUDACC_RTC__ */
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
#undef va_start
|
| 270 |
+
#undef va_end
|
| 271 |
+
#undef va_arg
|
| 272 |
+
|
| 273 |
+
#ifdef __PGIC__
|
| 274 |
+
|
| 275 |
+
#undef __builtin_va_end
|
| 276 |
+
|
| 277 |
+
#define va_start(v,l) __builtin_alt_va_start(v,l)
|
| 278 |
+
#define va_end(v) __builtin_va_end(v)
|
| 279 |
+
#define va_arg(v,l) __builtin_alt_va_arg(v,l)
|
| 280 |
+
|
| 281 |
+
#if (__cplusplus >= 201103L)
|
| 282 |
+
#undef va_copy
|
| 283 |
+
#define va_copy(d,s) __builtin_va_copy(d,s)
|
| 284 |
+
#endif
|
| 285 |
+
|
| 286 |
+
#else /* !__PGIC__ */
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
#define va_start(ap, x) (__cu_va_start(&ap, x))
|
| 290 |
+
#define va_end(ap) (__cu_va_end(&ap))
|
| 291 |
+
#define va_arg(ap, t) (*((t *)__cu_va_arg(&ap, (t *)0)))
|
| 292 |
+
|
| 293 |
+
#if (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L))
|
| 294 |
+
#undef va_copy
|
| 295 |
+
#define va_copy(apd, aps) (__cu_va_copy(&(apd), &(aps)))
|
| 296 |
+
#endif /* (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L)) */
|
| 297 |
+
#endif /* __PGIC__ */
|
| 298 |
+
|
| 299 |
+
#endif /* defined(__cplusplus) && (defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER))) */
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
#endif /* __CUDACC__ */
|
| 304 |
+
|
| 305 |
+
#endif /* !__HOST_CONFIG_H__ */
|
| 306 |
+
|
| 307 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__)
|
| 308 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 309 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__
|
| 310 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_defines.h
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2023 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("crt/host_defines.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "crt/host_defines.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#if !defined(__HOST_DEFINES_H__)
|
| 61 |
+
#define __HOST_DEFINES_H__
|
| 62 |
+
|
| 63 |
+
#if defined(__CUDACC__) && !defined(__CUDACC_RTC__) && !defined(__CUDADEVRT_INTERNAL__) && !defined(_ALLOW_UNSUPPORTED_LIBCPP)
|
| 64 |
+
#include <ctype.h>
|
| 65 |
+
#if ((defined(_MSC_VER ) && (defined(_M_X64) || defined(_M_AMD64))) ||\
|
| 66 |
+
(defined(__x86_64__) || defined(__amd64__))) && defined(_LIBCPP_VERSION) && !(defined(__HORIZON__) || defined(__ANDROID__) || defined(__QNX__))
|
| 67 |
+
#error "libc++ is not supported on x86 system"
|
| 68 |
+
#endif
|
| 69 |
+
#endif
|
| 70 |
+
|
| 71 |
+
/* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */
|
| 72 |
+
#if defined(__GNUC__) || (defined(__PGIC__) && defined(__linux__)) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__)
|
| 73 |
+
|
| 74 |
+
#if defined(__CUDACC_RTC__)
|
| 75 |
+
#define __volatile__ volatile
|
| 76 |
+
#endif /* __CUDACC_RTC__ */
|
| 77 |
+
|
| 78 |
+
#define __no_return__ \
|
| 79 |
+
__attribute__((noreturn))
|
| 80 |
+
|
| 81 |
+
#if defined(__CUDACC__) || defined(__CUDA_ARCH__) || defined(__CUDA_LIBDEVICE__)
|
| 82 |
+
/* gcc allows users to define attributes with underscores,
|
| 83 |
+
e.g., __attribute__((__noinline__)).
|
| 84 |
+
Consider a non-CUDA source file (e.g. .cpp) that has the
|
| 85 |
+
above attribute specification, and includes this header file. In that case,
|
| 86 |
+
defining __noinline__ as below would cause a gcc compilation error.
|
| 87 |
+
Hence, only define __noinline__ when the code is being processed
|
| 88 |
+
by a CUDA compiler component.
|
| 89 |
+
*/
|
| 90 |
+
#define __noinline__ \
|
| 91 |
+
__attribute__((noinline))
|
| 92 |
+
#endif /* __CUDACC__ || __CUDA_ARCH__ || __CUDA_LIBDEVICE__ */
|
| 93 |
+
|
| 94 |
+
#undef __forceinline__
|
| 95 |
+
#define __forceinline__ \
|
| 96 |
+
__inline__ __attribute__((always_inline))
|
| 97 |
+
#define __inline_hint__ \
|
| 98 |
+
__attribute__((nv_inline_hint))
|
| 99 |
+
#define __align__(n) \
|
| 100 |
+
__attribute__((aligned(n)))
|
| 101 |
+
#define __maxnreg__(a) \
|
| 102 |
+
__attribute__((maxnreg(a)))
|
| 103 |
+
#define __thread__ \
|
| 104 |
+
__thread
|
| 105 |
+
#define __import__
|
| 106 |
+
#define __export__
|
| 107 |
+
#define __cdecl
|
| 108 |
+
#define __annotate__(a) \
|
| 109 |
+
__attribute__((a))
|
| 110 |
+
#define __location__(a) \
|
| 111 |
+
__annotate__(a)
|
| 112 |
+
#define CUDARTAPI
|
| 113 |
+
#define CUDARTAPI_CDECL
|
| 114 |
+
|
| 115 |
+
#elif defined(_MSC_VER)
|
| 116 |
+
|
| 117 |
+
#if _MSC_VER >= 1400
|
| 118 |
+
|
| 119 |
+
#define __restrict__ \
|
| 120 |
+
__restrict
|
| 121 |
+
|
| 122 |
+
#else /* _MSC_VER >= 1400 */
|
| 123 |
+
|
| 124 |
+
#define __restrict__
|
| 125 |
+
|
| 126 |
+
#endif /* _MSC_VER >= 1400 */
|
| 127 |
+
|
| 128 |
+
#define __inline__ \
|
| 129 |
+
__inline
|
| 130 |
+
#define __no_return__ \
|
| 131 |
+
__declspec(noreturn)
|
| 132 |
+
#define __noinline__ \
|
| 133 |
+
__declspec(noinline)
|
| 134 |
+
#define __forceinline__ \
|
| 135 |
+
__forceinline
|
| 136 |
+
#define __inline_hint__ \
|
| 137 |
+
__declspec(nv_inline_hint)
|
| 138 |
+
#define __align__(n) \
|
| 139 |
+
__declspec(align(n))
|
| 140 |
+
#define __maxnreg__(n) \
|
| 141 |
+
__declspec(maxnreg(n))
|
| 142 |
+
#define __thread__ \
|
| 143 |
+
__declspec(thread)
|
| 144 |
+
#define __import__ \
|
| 145 |
+
__declspec(dllimport)
|
| 146 |
+
#define __export__ \
|
| 147 |
+
__declspec(dllexport)
|
| 148 |
+
#define __annotate__(a) \
|
| 149 |
+
__declspec(a)
|
| 150 |
+
#define __location__(a) \
|
| 151 |
+
__annotate__(__##a##__)
|
| 152 |
+
#define CUDARTAPI \
|
| 153 |
+
__stdcall
|
| 154 |
+
#define CUDARTAPI_CDECL \
|
| 155 |
+
__cdecl
|
| 156 |
+
|
| 157 |
+
#else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
|
| 158 |
+
|
| 159 |
+
#define __inline__
|
| 160 |
+
|
| 161 |
+
#if !defined(__align__)
|
| 162 |
+
|
| 163 |
+
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! ---
|
| 164 |
+
|
| 165 |
+
#endif /* !__align__ */
|
| 166 |
+
|
| 167 |
+
#if !defined(CUDARTAPI)
|
| 168 |
+
|
| 169 |
+
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! ---
|
| 170 |
+
|
| 171 |
+
#endif /* !CUDARTAPI */
|
| 172 |
+
|
| 173 |
+
#endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
|
| 174 |
+
|
| 175 |
+
#if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \
|
| 176 |
+
(defined(_MSC_VER) && _MSC_VER < 1900) || \
|
| 177 |
+
(!defined(__GNUC__) && !defined(_MSC_VER))
|
| 178 |
+
|
| 179 |
+
#define __specialization_static \
|
| 180 |
+
static
|
| 181 |
+
|
| 182 |
+
#else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
|
| 183 |
+
(_MSC_VER && _MSC_VER < 1900) ||
|
| 184 |
+
(!__GNUC__ && !_MSC_VER) */
|
| 185 |
+
|
| 186 |
+
#define __specialization_static
|
| 187 |
+
|
| 188 |
+
#endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
|
| 189 |
+
(_MSC_VER && _MSC_VER < 1900) ||
|
| 190 |
+
(!__GNUC__ && !_MSC_VER) */
|
| 191 |
+
|
| 192 |
+
#if !defined(__CUDACC__) && !defined(__CUDA_LIBDEVICE__)
|
| 193 |
+
|
| 194 |
+
#undef __annotate__
|
| 195 |
+
#define __annotate__(a)
|
| 196 |
+
|
| 197 |
+
#else /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
|
| 198 |
+
|
| 199 |
+
#define __launch_bounds__(...) \
|
| 200 |
+
__annotate__(launch_bounds(__VA_ARGS__))
|
| 201 |
+
|
| 202 |
+
#endif /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
|
| 203 |
+
|
| 204 |
+
#if defined(__CUDACC__) || defined(__CUDA_LIBDEVICE__) || \
|
| 205 |
+
defined(__GNUC__) || defined(_WIN64)
|
| 206 |
+
|
| 207 |
+
#define __builtin_align__(a) \
|
| 208 |
+
__align__(a)
|
| 209 |
+
|
| 210 |
+
#else /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
|
| 211 |
+
|
| 212 |
+
#define __builtin_align__(a)
|
| 213 |
+
|
| 214 |
+
#endif /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
|
| 215 |
+
|
| 216 |
+
#if defined(__CUDACC__) || !defined(__grid_constant__)
|
| 217 |
+
#define __grid_constant__ \
|
| 218 |
+
__location__(grid_constant)
|
| 219 |
+
#endif /* defined(__CUDACC__) || !defined(__grid_constant__) */
|
| 220 |
+
|
| 221 |
+
#if defined(__CUDACC__) || !defined(__host__)
|
| 222 |
+
#define __host__ \
|
| 223 |
+
__location__(host)
|
| 224 |
+
#endif /* defined(__CUDACC__) || !defined(__host__) */
|
| 225 |
+
#if defined(__CUDACC__) || !defined(__device__)
|
| 226 |
+
#define __device__ \
|
| 227 |
+
__location__(device)
|
| 228 |
+
#endif /* defined(__CUDACC__) || !defined(__device__) */
|
| 229 |
+
#if defined(__CUDACC__) || !defined(__global__)
|
| 230 |
+
#define __global__ \
|
| 231 |
+
__location__(global)
|
| 232 |
+
#endif /* defined(__CUDACC__) || !defined(__global__) */
|
| 233 |
+
#if defined(__CUDACC__) || !defined(__shared__)
|
| 234 |
+
#define __shared__ \
|
| 235 |
+
__location__(shared)
|
| 236 |
+
#endif /* defined(__CUDACC__) || !defined(__shared__) */
|
| 237 |
+
#if defined(__CUDACC__) || !defined(__constant__)
|
| 238 |
+
#define __constant__ \
|
| 239 |
+
__location__(constant)
|
| 240 |
+
#endif /* defined(__CUDACC__) || !defined(__constant__) */
|
| 241 |
+
#if defined(__CUDACC__) || !defined(__managed__)
|
| 242 |
+
#define __managed__ \
|
| 243 |
+
__location__(managed)
|
| 244 |
+
#endif /* defined(__CUDACC__) || !defined(__managed__) */
|
| 245 |
+
|
| 246 |
+
#if !defined(__CUDACC__)
|
| 247 |
+
#define __device_builtin__
|
| 248 |
+
#define __device_builtin_texture_type__
|
| 249 |
+
#define __device_builtin_surface_type__
|
| 250 |
+
#define __cudart_builtin__
|
| 251 |
+
#else /* defined(__CUDACC__) */
|
| 252 |
+
#define __device_builtin__ \
|
| 253 |
+
__location__(device_builtin)
|
| 254 |
+
#define __device_builtin_texture_type__ \
|
| 255 |
+
__location__(device_builtin_texture_type)
|
| 256 |
+
#define __device_builtin_surface_type__ \
|
| 257 |
+
__location__(device_builtin_surface_type)
|
| 258 |
+
#define __cudart_builtin__ \
|
| 259 |
+
__location__(cudart_builtin)
|
| 260 |
+
#endif /* !defined(__CUDACC__) */
|
| 261 |
+
|
| 262 |
+
#if defined(__CUDACC__) || !defined(__cluster_dims__)
|
| 263 |
+
#if defined(_MSC_VER)
|
| 264 |
+
#define __cluster_dims__(...) \
|
| 265 |
+
__declspec(__cluster_dims__(__VA_ARGS__))
|
| 266 |
+
|
| 267 |
+
#else /* !defined(_MSC_VER) */
|
| 268 |
+
#define __cluster_dims__(...) \
|
| 269 |
+
__attribute__((cluster_dims(__VA_ARGS__)))
|
| 270 |
+
#endif /* defined(_MSC_VER) */
|
| 271 |
+
#endif /* defined(__CUDACC__) || !defined(__cluster_dims__) */
|
| 272 |
+
|
| 273 |
+
#define __CUDA_ARCH_HAS_FEATURE__(_FEAT) __CUDA_ARCH_FEAT_##_FEAT
|
| 274 |
+
|
| 275 |
+
#endif /* !__HOST_DEFINES_H__ */
|
| 276 |
+
|
| 277 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__)
|
| 278 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 279 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__
|
| 280 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_runtime.h
ADDED
|
@@ -0,0 +1,306 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* NVIDIA_COPYRIGHT_BEGIN
|
| 3 |
+
*
|
| 4 |
+
* Copyright (c) 2008-2023, NVIDIA CORPORATION. All rights reserved.
|
| 5 |
+
*
|
| 6 |
+
* NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 7 |
+
* and proprietary rights in and to this software, related documentation
|
| 8 |
+
* and any modifications thereto. Any use, reproduction, disclosure or
|
| 9 |
+
* distribution of this software and related documentation without an express
|
| 10 |
+
* license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 11 |
+
*
|
| 12 |
+
* NVIDIA_COPYRIGHT_END
|
| 13 |
+
*/
|
| 14 |
+
|
| 15 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 16 |
+
#if defined(_MSC_VER)
|
| 17 |
+
#pragma message("crt/device_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 18 |
+
#else
|
| 19 |
+
#warning "crt/device_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 20 |
+
#endif
|
| 21 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 22 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__
|
| 23 |
+
#endif
|
| 24 |
+
|
| 25 |
+
#if !defined(__CUDA_INTERNAL_COMPILATION__)
|
| 26 |
+
|
| 27 |
+
#define __CUDA_INTERNAL_COMPILATION__
|
| 28 |
+
#define __text__
|
| 29 |
+
#define __surf__
|
| 30 |
+
#define __name__shadow_var(c, cpp) \
|
| 31 |
+
#c
|
| 32 |
+
#define __name__text_var(c, cpp) \
|
| 33 |
+
#cpp
|
| 34 |
+
#define __host__shadow_var(c, cpp) \
|
| 35 |
+
cpp
|
| 36 |
+
#define __text_var(c, cpp) \
|
| 37 |
+
cpp
|
| 38 |
+
#define __device_fun(fun) \
|
| 39 |
+
#fun
|
| 40 |
+
#define __device_var(var) \
|
| 41 |
+
#var
|
| 42 |
+
#define __device__text_var(c, cpp) \
|
| 43 |
+
#c
|
| 44 |
+
#define __device__shadow_var(c, cpp) \
|
| 45 |
+
#c
|
| 46 |
+
|
| 47 |
+
#if defined(_WIN32) && !defined(_WIN64)
|
| 48 |
+
|
| 49 |
+
#define __pad__(f) \
|
| 50 |
+
f
|
| 51 |
+
|
| 52 |
+
#else /* _WIN32 && !_WIN64 */
|
| 53 |
+
|
| 54 |
+
#define __pad__(f)
|
| 55 |
+
|
| 56 |
+
#endif /* _WIN32 && !_WIN64 */
|
| 57 |
+
|
| 58 |
+
#include "builtin_types.h"
|
| 59 |
+
#include "storage_class.h"
|
| 60 |
+
|
| 61 |
+
#else /* !__CUDA_INTERNAL_COMPILATION__ */
|
| 62 |
+
|
| 63 |
+
template <typename T>
|
| 64 |
+
static inline T *__cudaAddressOf(T &val)
|
| 65 |
+
{
|
| 66 |
+
return (T *)((void *)(&(const_cast<char &>(reinterpret_cast<const volatile char &>(val)))));
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
#define __cudaRegisterBinary(X) \
|
| 70 |
+
__cudaFatCubinHandle = __cudaRegisterFatBinary((void*)&__fatDeviceText); \
|
| 71 |
+
{ void (*callback_fp)(void **) = (void (*)(void **))(X); (*callback_fp)(__cudaFatCubinHandle); __cudaRegisterFatBinaryEnd(__cudaFatCubinHandle); }\
|
| 72 |
+
atexit(__cudaUnregisterBinaryUtil)
|
| 73 |
+
|
| 74 |
+
#define __cudaRegisterVariable(handle, var, ext, size, constant, global) \
|
| 75 |
+
__cudaRegisterVar(handle, (char*)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
|
| 76 |
+
#define __cudaRegisterManagedVariable(handle, var, ext, size, constant, global) \
|
| 77 |
+
__cudaRegisterManagedVar(handle, (void **)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
|
| 78 |
+
|
| 79 |
+
#define __cudaRegisterGlobalTexture(handle, tex, dim, norm, ext) \
|
| 80 |
+
__cudaRegisterTexture(handle, (const struct textureReference*)&tex, (const void**)(void*)__device##tex, __name##tex, dim, norm, ext)
|
| 81 |
+
#define __cudaRegisterGlobalSurface(handle, surf, dim, ext) \
|
| 82 |
+
__cudaRegisterSurface(handle, (const struct surfaceReference*)&surf, (const void**)(void*)__device##surf, __name##surf, dim, ext)
|
| 83 |
+
#define __cudaRegisterEntry(handle, funptr, fun, thread_limit) \
|
| 84 |
+
__cudaRegisterFunction(handle, (const char*)funptr, (char*)__device_fun(fun), #fun, -1, (uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0)
|
| 85 |
+
|
| 86 |
+
extern "C" cudaError_t CUDARTAPI __cudaPopCallConfiguration(
|
| 87 |
+
dim3 *gridDim,
|
| 88 |
+
dim3 *blockDim,
|
| 89 |
+
size_t *sharedMem,
|
| 90 |
+
void *stream
|
| 91 |
+
);
|
| 92 |
+
|
| 93 |
+
#define __cudaLaunchPrologue(size) \
|
| 94 |
+
void * __args_arr[size]; \
|
| 95 |
+
int __args_idx = 0
|
| 96 |
+
|
| 97 |
+
#define __cudaSetupArg(arg, offset) \
|
| 98 |
+
__args_arr[__args_idx] = (void *)__cudaAddressOf(arg); ++__args_idx
|
| 99 |
+
|
| 100 |
+
#define __cudaSetupArgSimple(arg, offset) \
|
| 101 |
+
__args_arr[__args_idx] = (void *)(char *)&arg; ++__args_idx
|
| 102 |
+
|
| 103 |
+
#if defined(__GNUC__)
|
| 104 |
+
#define __NV_ATTR_UNUSED_FOR_LAUNCH __attribute__((unused))
|
| 105 |
+
#else /* !__GNUC__ */
|
| 106 |
+
#define __NV_ATTR_UNUSED_FOR_LAUNCH
|
| 107 |
+
#endif /* __GNUC__ */
|
| 108 |
+
|
| 109 |
+
#ifdef __NV_LEGACY_LAUNCH
|
| 110 |
+
/* the use of __args_idx in the expression below avoids host compiler warning about it being an
|
| 111 |
+
unused variable when the launch has no arguments */
|
| 112 |
+
#define __cudaLaunch(fun) \
|
| 113 |
+
{ volatile static char *__f __NV_ATTR_UNUSED_FOR_LAUNCH; __f = fun; \
|
| 114 |
+
dim3 __gridDim, __blockDim;\
|
| 115 |
+
size_t __sharedMem; \
|
| 116 |
+
cudaStream_t __stream; \
|
| 117 |
+
if (__cudaPopCallConfiguration(&__gridDim, &__blockDim, &__sharedMem, &__stream) != cudaSuccess) \
|
| 118 |
+
return; \
|
| 119 |
+
if (__args_idx == 0) {\
|
| 120 |
+
(void)cudaLaunchKernel(fun, __gridDim, __blockDim, &__args_arr[__args_idx], __sharedMem, __stream);\
|
| 121 |
+
} else { \
|
| 122 |
+
(void)cudaLaunchKernel(fun, __gridDim, __blockDim, &__args_arr[0], __sharedMem, __stream);\
|
| 123 |
+
}\
|
| 124 |
+
}
|
| 125 |
+
#else /* !__NV_LEGACY_LAUNCH */
|
| 126 |
+
#define __cudaLaunch(fun) \
|
| 127 |
+
{ volatile static char *__f __NV_ATTR_UNUSED_FOR_LAUNCH; __f = fun; \
|
| 128 |
+
static cudaKernel_t __handle = 0; \
|
| 129 |
+
volatile static bool __tmp __NV_ATTR_UNUSED_FOR_LAUNCH = (__cudaGetKernel(&__handle, (const void *)fun) == cudaSuccess); \
|
| 130 |
+
dim3 __gridDim, __blockDim;\
|
| 131 |
+
size_t __sharedMem; \
|
| 132 |
+
cudaStream_t __stream; \
|
| 133 |
+
if (__cudaPopCallConfiguration(&__gridDim, &__blockDim, &__sharedMem, &__stream) != cudaSuccess) \
|
| 134 |
+
return; \
|
| 135 |
+
if (__args_idx == 0) {\
|
| 136 |
+
(void)__cudaLaunchKernel_helper(__handle, __gridDim, __blockDim, &__args_arr[__args_idx], __sharedMem, __stream);\
|
| 137 |
+
} else { \
|
| 138 |
+
(void)__cudaLaunchKernel_helper(__handle, __gridDim, __blockDim, &__args_arr[0], __sharedMem, __stream);\
|
| 139 |
+
}\
|
| 140 |
+
}
|
| 141 |
+
#endif /* __NV_LEGACY_LAUNCH */
|
| 142 |
+
|
| 143 |
+
#if defined(__GNUC__)
|
| 144 |
+
#define __nv_dummy_param_ref(param) \
|
| 145 |
+
{ volatile static void **__ref __attribute__((unused)); __ref = (volatile void **)param; }
|
| 146 |
+
#else /* __GNUC__ */
|
| 147 |
+
#define __nv_dummy_param_ref(param) \
|
| 148 |
+
{ volatile static void **__ref; __ref = (volatile void **)param; }
|
| 149 |
+
#endif /* __GNUC__ */
|
| 150 |
+
|
| 151 |
+
static void ____nv_dummy_param_ref(void *param) __nv_dummy_param_ref(param)
|
| 152 |
+
|
| 153 |
+
#define __REGISTERFUNCNAME_CORE(X) __cudaRegisterLinkedBinary##X
|
| 154 |
+
#define __REGISTERFUNCNAME(X) __REGISTERFUNCNAME_CORE(X)
|
| 155 |
+
|
| 156 |
+
extern "C" {
|
| 157 |
+
void __REGISTERFUNCNAME( __NV_MODULE_ID ) ( void (*)(void **), void *, void *, void (*)(void *));
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
#define __TO_STRING_CORE(X) #X
|
| 161 |
+
#define __TO_STRING(X) __TO_STRING_CORE(X)
|
| 162 |
+
|
| 163 |
+
extern "C" {
|
| 164 |
+
#if defined(_WIN32)
|
| 165 |
+
#pragma data_seg("__nv_module_id")
|
| 166 |
+
static const __declspec(allocate("__nv_module_id")) unsigned char __module_id_str[] = __TO_STRING(__NV_MODULE_ID);
|
| 167 |
+
#pragma data_seg()
|
| 168 |
+
#elif defined(__APPLE__)
|
| 169 |
+
static const unsigned char __module_id_str[] __attribute__((section ("__NV_CUDA,__nv_module_id"))) = __TO_STRING(__NV_MODULE_ID);
|
| 170 |
+
#else
|
| 171 |
+
static const unsigned char __module_id_str[] __attribute__((section ("__nv_module_id"))) = __TO_STRING(__NV_MODULE_ID);
|
| 172 |
+
#endif
|
| 173 |
+
|
| 174 |
+
#undef __FATIDNAME_CORE
|
| 175 |
+
#undef __FATIDNAME
|
| 176 |
+
#define __FATIDNAME_CORE(X) __fatbinwrap##X
|
| 177 |
+
#define __FATIDNAME(X) __FATIDNAME_CORE(X)
|
| 178 |
+
|
| 179 |
+
#define ____cudaRegisterLinkedBinary(X) \
|
| 180 |
+
{ __REGISTERFUNCNAME(__NV_MODULE_ID) (( void (*)(void **))(X), (void *)&__FATIDNAME(__NV_MODULE_ID), (void *)&__module_id_str, (void (*)(void *))&____nv_dummy_param_ref); }
|
| 181 |
+
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
extern "C" {
|
| 185 |
+
extern void** CUDARTAPI __cudaRegisterFatBinary(
|
| 186 |
+
void *fatCubin
|
| 187 |
+
);
|
| 188 |
+
|
| 189 |
+
extern void CUDARTAPI __cudaRegisterFatBinaryEnd(
|
| 190 |
+
void **fatCubinHandle
|
| 191 |
+
);
|
| 192 |
+
|
| 193 |
+
extern void CUDARTAPI __cudaUnregisterFatBinary(
|
| 194 |
+
void **fatCubinHandle
|
| 195 |
+
);
|
| 196 |
+
|
| 197 |
+
extern void CUDARTAPI __cudaRegisterVar(
|
| 198 |
+
void **fatCubinHandle,
|
| 199 |
+
char *hostVar,
|
| 200 |
+
char *deviceAddress,
|
| 201 |
+
const char *deviceName,
|
| 202 |
+
int ext,
|
| 203 |
+
size_t size,
|
| 204 |
+
int constant,
|
| 205 |
+
int global
|
| 206 |
+
);
|
| 207 |
+
|
| 208 |
+
extern void CUDARTAPI __cudaRegisterManagedVar(
|
| 209 |
+
void **fatCubinHandle,
|
| 210 |
+
void **hostVarPtrAddress,
|
| 211 |
+
char *deviceAddress,
|
| 212 |
+
const char *deviceName,
|
| 213 |
+
int ext,
|
| 214 |
+
size_t size,
|
| 215 |
+
int constant,
|
| 216 |
+
int global
|
| 217 |
+
);
|
| 218 |
+
|
| 219 |
+
extern char CUDARTAPI __cudaInitModule(
|
| 220 |
+
void **fatCubinHandle
|
| 221 |
+
);
|
| 222 |
+
|
| 223 |
+
extern void CUDARTAPI __cudaRegisterTexture(
|
| 224 |
+
void **fatCubinHandle,
|
| 225 |
+
const struct textureReference *hostVar,
|
| 226 |
+
const void **deviceAddress,
|
| 227 |
+
const char *deviceName,
|
| 228 |
+
int dim,
|
| 229 |
+
int norm,
|
| 230 |
+
int ext
|
| 231 |
+
);
|
| 232 |
+
|
| 233 |
+
extern void CUDARTAPI __cudaRegisterSurface(
|
| 234 |
+
void **fatCubinHandle,
|
| 235 |
+
const struct surfaceReference *hostVar,
|
| 236 |
+
const void **deviceAddress,
|
| 237 |
+
const char *deviceName,
|
| 238 |
+
int dim,
|
| 239 |
+
int ext
|
| 240 |
+
);
|
| 241 |
+
|
| 242 |
+
extern void CUDARTAPI __cudaRegisterFunction(
|
| 243 |
+
void **fatCubinHandle,
|
| 244 |
+
const char *hostFun,
|
| 245 |
+
char *deviceFun,
|
| 246 |
+
const char *deviceName,
|
| 247 |
+
int thread_limit,
|
| 248 |
+
uint3 *tid,
|
| 249 |
+
uint3 *bid,
|
| 250 |
+
dim3 *bDim,
|
| 251 |
+
dim3 *gDim,
|
| 252 |
+
int *wSize
|
| 253 |
+
);
|
| 254 |
+
|
| 255 |
+
#if defined(__APPLE__)
|
| 256 |
+
extern "C" int atexit(void (*)(void));
|
| 257 |
+
|
| 258 |
+
#elif defined(__GNUC__) && !defined(__ANDROID__) && !defined(__HORIZON__)
|
| 259 |
+
extern int atexit(void(*)(void)) throw();
|
| 260 |
+
|
| 261 |
+
#elif defined(__HORIZON__)
|
| 262 |
+
|
| 263 |
+
// __TEMP_WAR__ 200132570 HOS : Disable atexit call until it works
|
| 264 |
+
#define atexit(p)
|
| 265 |
+
|
| 266 |
+
#else /* __GNUC__ && !__ANDROID__ */
|
| 267 |
+
extern int __cdecl atexit(void(__cdecl *)(void));
|
| 268 |
+
#endif
|
| 269 |
+
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
static void **__cudaFatCubinHandle;
|
| 273 |
+
|
| 274 |
+
static void __cdecl __cudaUnregisterBinaryUtil(void)
|
| 275 |
+
{
|
| 276 |
+
____nv_dummy_param_ref((void *)&__cudaFatCubinHandle);
|
| 277 |
+
__cudaUnregisterFatBinary(__cudaFatCubinHandle);
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
static char __nv_init_managed_rt_with_module(void **handle)
|
| 281 |
+
{
|
| 282 |
+
return __cudaInitModule(handle);
|
| 283 |
+
}
|
| 284 |
+
|
| 285 |
+
#include "common_functions.h"
|
| 286 |
+
|
| 287 |
+
#pragma pack()
|
| 288 |
+
|
| 289 |
+
#if defined(_WIN32)
|
| 290 |
+
|
| 291 |
+
#pragma warning(disable: 4099)
|
| 292 |
+
|
| 293 |
+
#if !defined(_WIN64)
|
| 294 |
+
|
| 295 |
+
#pragma warning(disable: 4408)
|
| 296 |
+
|
| 297 |
+
#endif /* !_WIN64 */
|
| 298 |
+
|
| 299 |
+
#endif /* _WIN32 */
|
| 300 |
+
|
| 301 |
+
#endif /* !__CUDA_INTERNAL_COMPILATION__ */
|
| 302 |
+
|
| 303 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__)
|
| 304 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 305 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__
|
| 306 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/math_functions.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/math_functions.hpp
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/mma.h
ADDED
|
@@ -0,0 +1,754 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2017-2020 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("crt/mma.h is an internal header file and must not be used directly. Please use mma.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "crt/mma.h is an internal header file and must not be used directly. Please use mma.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#if !defined(__CUDA_MMA_H__)
|
| 61 |
+
#define __CUDA_MMA_H__
|
| 62 |
+
|
| 63 |
+
#include <cuda_fp16.h>
|
| 64 |
+
#include <cuda_bf16.h>
|
| 65 |
+
|
| 66 |
+
#define __CUDA_MMA_DEVICE_DECL__ static __device__ __inline__
|
| 67 |
+
|
| 68 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 69 |
+
|
| 70 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
|
| 74 |
+
#define __DEF_IF_HOST { }
|
| 75 |
+
#else /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
|
| 76 |
+
#define __DEF_IF_HOST ;
|
| 77 |
+
#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
|
| 78 |
+
|
| 79 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720
|
| 80 |
+
#define __CUDA_IMMA__ 1
|
| 81 |
+
#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720 */
|
| 82 |
+
|
| 83 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730
|
| 84 |
+
#define __CUDA_SUBBYTE_IMMA__ 1
|
| 85 |
+
#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730 */
|
| 86 |
+
|
| 87 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
|
| 88 |
+
#define __CUDA_AMPERE_MMA__ 1
|
| 89 |
+
#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 */
|
| 90 |
+
|
| 91 |
+
namespace nvcuda {
|
| 92 |
+
namespace wmma {
|
| 93 |
+
|
| 94 |
+
// utility functions
|
| 95 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 96 |
+
inline __device__ float __float_to_tf32(float in)
|
| 97 |
+
{
|
| 98 |
+
float ret;
|
| 99 |
+
asm("{\n .reg .b32 __$1;"
|
| 100 |
+
"\n cvt.rna.tf32.f32 __$1, %1;"
|
| 101 |
+
"\n mov.b32 %0, __$1;\n}\n" : "=f"(ret) : "f"(in) );
|
| 102 |
+
return ret;
|
| 103 |
+
}
|
| 104 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 105 |
+
|
| 106 |
+
//
|
| 107 |
+
// tags
|
| 108 |
+
//
|
| 109 |
+
struct row_major;
|
| 110 |
+
struct col_major;
|
| 111 |
+
struct matrix_a;
|
| 112 |
+
struct matrix_b;
|
| 113 |
+
struct accumulator;
|
| 114 |
+
|
| 115 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 116 |
+
namespace precision {
|
| 117 |
+
struct tf32;
|
| 118 |
+
}
|
| 119 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 120 |
+
#ifdef __CUDA_SUBBYTE_IMMA__
|
| 121 |
+
namespace experimental {
|
| 122 |
+
namespace precision {
|
| 123 |
+
struct u4; // 4-bit unsigned
|
| 124 |
+
struct s4; // 4-bit signed
|
| 125 |
+
struct b1; // 1-bit
|
| 126 |
+
}
|
| 127 |
+
enum bmmaBitOp { bmmaBitOpXOR = 1
|
| 128 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 129 |
+
, bmmaBitOpAND = 2
|
| 130 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 131 |
+
};
|
| 132 |
+
enum bmmaAccumulateOp { bmmaAccumulateOpPOPC = 1 };
|
| 133 |
+
}
|
| 134 |
+
#endif /* __CUDA_SUBBYTE_IMMA__ */
|
| 135 |
+
|
| 136 |
+
//
|
| 137 |
+
// layout
|
| 138 |
+
//
|
| 139 |
+
enum layout_t {
|
| 140 |
+
mem_row_major, mem_col_major
|
| 141 |
+
};
|
| 142 |
+
|
| 143 |
+
template <typename T>
|
| 144 |
+
struct helper_traits {
|
| 145 |
+
typedef T element_type;
|
| 146 |
+
typedef T storage_element_type;
|
| 147 |
+
typedef T fill_argument_type;
|
| 148 |
+
};
|
| 149 |
+
|
| 150 |
+
#ifdef __CUDA_SUBBYTE_IMMA__
|
| 151 |
+
template<> struct helper_traits<experimental::precision::u4> {
|
| 152 |
+
typedef experimental::precision::u4 element_type;
|
| 153 |
+
typedef unsigned int storage_element_type;
|
| 154 |
+
typedef unsigned int fill_argument_type;
|
| 155 |
+
};
|
| 156 |
+
|
| 157 |
+
template<> struct helper_traits<experimental::precision::s4> {
|
| 158 |
+
typedef experimental::precision::s4 element_type;
|
| 159 |
+
typedef int storage_element_type;
|
| 160 |
+
typedef int fill_argument_type;
|
| 161 |
+
};
|
| 162 |
+
|
| 163 |
+
template<> struct helper_traits<experimental::precision::b1> {
|
| 164 |
+
typedef experimental::precision::b1 element_type;
|
| 165 |
+
typedef unsigned int storage_element_type;
|
| 166 |
+
typedef unsigned int fill_argument_type;
|
| 167 |
+
};
|
| 168 |
+
#endif /* __CUDA_SUBBYTE_IMMA__ */
|
| 169 |
+
|
| 170 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 171 |
+
template<> struct helper_traits<precision::tf32> {
|
| 172 |
+
typedef precision::tf32 element_type;
|
| 173 |
+
typedef float storage_element_type;
|
| 174 |
+
typedef float fill_argument_type;
|
| 175 |
+
};
|
| 176 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 177 |
+
|
| 178 |
+
//
|
| 179 |
+
// The base fragment type
|
| 180 |
+
//
|
| 181 |
+
/* note: alignment required for compiler implementation */
|
| 182 |
+
template <typename T, int size, int packed_size = size>
|
| 183 |
+
struct __align__(8) __frag_base {
|
| 184 |
+
|
| 185 |
+
/* Number of elements in the fragment */
|
| 186 |
+
enum {num_elements = size};
|
| 187 |
+
|
| 188 |
+
/* Number of storage elements in the fragment.
|
| 189 |
+
|
| 190 |
+
The elements of the fragment are packed together when the
|
| 191 |
+
fragment element type is experimental::precision::u4,
|
| 192 |
+
experimental::precision::s4 or experimental::precision::b1.
|
| 193 |
+
When elements are packed, num_storage_elements
|
| 194 |
+
will be smaller than num_elements.
|
| 195 |
+
*/
|
| 196 |
+
enum {num_storage_elements = packed_size};
|
| 197 |
+
|
| 198 |
+
/* element type of the fragment */
|
| 199 |
+
typedef T element_type;
|
| 200 |
+
|
| 201 |
+
/* element type of the storage representation.
|
| 202 |
+
|
| 203 |
+
The mapping from element_type to storage_element_type is as follows:
|
| 204 |
+
experimental::precision::u4 -> unsigned (8 elements in 1 storage element)
|
| 205 |
+
experimental::precision::s4 -> int (8 elements in 1 storage element)
|
| 206 |
+
experimental::precision::b1 -> unsigned (32 elements in 1 storage element)
|
| 207 |
+
precision::tf32 -> float (1 element in 1 storage element)
|
| 208 |
+
all other types T -> T
|
| 209 |
+
*/
|
| 210 |
+
typedef typename helper_traits<T>::storage_element_type storage_element_type;
|
| 211 |
+
|
| 212 |
+
/* Storage for the (possibly packed) fragment elements. */
|
| 213 |
+
storage_element_type x[num_storage_elements];
|
| 214 |
+
};
|
| 215 |
+
|
| 216 |
+
template <typename FragEleType, typename StorageType, typename ArgType>
|
| 217 |
+
static inline __device__ StorageType __get_storage_value(ArgType in) { return in; }
|
| 218 |
+
|
| 219 |
+
#ifdef __CUDA_SUBBYTE_IMMA__
|
| 220 |
+
template<>
|
| 221 |
+
__device__ inline unsigned
|
| 222 |
+
__get_storage_value<experimental::precision::u4, unsigned, unsigned>(unsigned in)
|
| 223 |
+
{
|
| 224 |
+
/* For experimental::precision::u4 fragment element type, pack 8 elements into a single
|
| 225 |
+
32-bit unsigned int storage element */
|
| 226 |
+
unsigned val = in & 0xf;
|
| 227 |
+
return (val | (val << 4) | (val << 8) | (val << 12) | (val << 16) |
|
| 228 |
+
(val << 20) | (val << 24) | (val << 28));
|
| 229 |
+
};
|
| 230 |
+
|
| 231 |
+
template<>
|
| 232 |
+
__device__ inline int
|
| 233 |
+
__get_storage_value<experimental::precision::s4, int, int>(int in)
|
| 234 |
+
{
|
| 235 |
+
/* For experimental::precision::s4 fragment element type, pack 8 elements into a single
|
| 236 |
+
32-bit signed int storage element */
|
| 237 |
+
int val = in & 0xf;
|
| 238 |
+
return (val | (val << 4) | (val << 8) | (val << 12) | (val << 16) |
|
| 239 |
+
(val << 20) | (val << 24) | (val << 28));
|
| 240 |
+
};
|
| 241 |
+
|
| 242 |
+
template<>
|
| 243 |
+
__device__ inline unsigned
|
| 244 |
+
__get_storage_value<experimental::precision::b1, unsigned, unsigned>(unsigned in)
|
| 245 |
+
{
|
| 246 |
+
/* For experimental::precision::b1 fragment element type, pack 32 elements into a
|
| 247 |
+
single 32-bit unsigned int storage element */
|
| 248 |
+
return (in & 0x1) ? 0xFFFFFFFFU : 0;
|
| 249 |
+
}
|
| 250 |
+
#endif /* __CUDA_SUBBYTE_IMMA__ */
|
| 251 |
+
|
| 252 |
+
template <typename FragEleType, int size, int packed_size>
|
| 253 |
+
__CUDA_MMA_DEVICE_DECL__ void fill_fragment(__frag_base<FragEleType, size, packed_size>& f,
|
| 254 |
+
/* The mapping from fragment element type (FragEleType) to fill_argument_type is:
|
| 255 |
+
experimental::precision::u4 -> unsigned (only lower 4 bits taken)
|
| 256 |
+
experimental::precision::s4 -> int (only lower 4 bits taken)
|
| 257 |
+
experimental::precision::b1 -> unsigned (only lowest 1 bit taken)
|
| 258 |
+
precision::tf32 -> float
|
| 259 |
+
all other types T -> T
|
| 260 |
+
*/
|
| 261 |
+
const typename helper_traits<FragEleType>::fill_argument_type & in) {
|
| 262 |
+
|
| 263 |
+
/* get the (possibly packed) storage element value. See the specializations above for fragment
|
| 264 |
+
element types where the storage representation is packed */
|
| 265 |
+
typedef typename helper_traits<FragEleType>::storage_element_type storage_type;
|
| 266 |
+
storage_type v = __get_storage_value<FragEleType, storage_type>(in);
|
| 267 |
+
#pragma unroll
|
| 268 |
+
for (int i=0; i< f.num_storage_elements; i++)
|
| 269 |
+
f.x[i] = v;
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
//
|
| 273 |
+
// Fragment template
|
| 274 |
+
//
|
| 275 |
+
template<typename Use, int m, int n, int k, typename T, typename Layout=void> class fragment;
|
| 276 |
+
|
| 277 |
+
//
|
| 278 |
+
// Fragments for 16x16x16
|
| 279 |
+
//
|
| 280 |
+
template<> class fragment<matrix_a, 16, 16, 16, __half, row_major> : public __frag_base<__half, 16> {};
|
| 281 |
+
template<> class fragment<matrix_a, 16, 16, 16, __half, col_major> : public __frag_base<__half, 16> {};
|
| 282 |
+
template<> class fragment<matrix_b, 16, 16, 16, __half, row_major> : public __frag_base<__half, 16> {};
|
| 283 |
+
template<> class fragment<matrix_b, 16, 16, 16, __half, col_major> : public __frag_base<__half, 16> {};
|
| 284 |
+
template<> class fragment<accumulator, 16, 16, 16, __half> : public __frag_base<__half, 8> {};
|
| 285 |
+
template<> class fragment<accumulator, 16, 16, 16, float> : public __frag_base<float, 8> {};
|
| 286 |
+
|
| 287 |
+
#ifdef __CUDA_IMMA__
|
| 288 |
+
template<> class fragment<matrix_a, 16, 16, 16, signed char, row_major> : public __frag_base<signed char, 8> {};
|
| 289 |
+
template<> class fragment<matrix_a, 16, 16, 16, signed char, col_major> : public __frag_base<signed char, 8> {};
|
| 290 |
+
template<> class fragment<matrix_a, 16, 16, 16, unsigned char, row_major> : public __frag_base<unsigned char, 8> {};
|
| 291 |
+
template<> class fragment<matrix_a, 16, 16, 16, unsigned char, col_major> : public __frag_base<unsigned char, 8> {};
|
| 292 |
+
template<> class fragment<matrix_b, 16, 16, 16, signed char, row_major> : public __frag_base<signed char, 8> {};
|
| 293 |
+
template<> class fragment<matrix_b, 16, 16, 16, signed char, col_major> : public __frag_base<signed char, 8> {};
|
| 294 |
+
template<> class fragment<matrix_b, 16, 16, 16, unsigned char, row_major> : public __frag_base<unsigned char, 8> {};
|
| 295 |
+
template<> class fragment<matrix_b, 16, 16, 16, unsigned char, col_major> : public __frag_base<unsigned char, 8> {};
|
| 296 |
+
template<> class fragment<accumulator, 16, 16, 16, int> : public __frag_base<int, 8> {};
|
| 297 |
+
#endif /* __CUDA_IMMA__ */
|
| 298 |
+
|
| 299 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 300 |
+
template<> class fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 8> {};
|
| 301 |
+
template<> class fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 8> {};
|
| 302 |
+
template<> class fragment<matrix_b, 16, 16, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 8> {};
|
| 303 |
+
template<> class fragment<matrix_b, 16, 16, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 8> {};
|
| 304 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 305 |
+
|
| 306 |
+
//
|
| 307 |
+
// Fragments for 32x8x16
|
| 308 |
+
//
|
| 309 |
+
template<> class fragment<matrix_a, 32, 8, 16, __half, row_major> : public __frag_base<__half, 16> {};
|
| 310 |
+
template<> class fragment<matrix_a, 32, 8, 16, __half, col_major> : public __frag_base<__half, 16> {};
|
| 311 |
+
template<> class fragment<matrix_b, 32, 8, 16, __half, row_major> : public __frag_base<__half, 16> {};
|
| 312 |
+
template<> class fragment<matrix_b, 32, 8, 16, __half, col_major> : public __frag_base<__half, 16> {};
|
| 313 |
+
template<> class fragment<accumulator, 32, 8, 16, __half> : public __frag_base<__half, 8> {};
|
| 314 |
+
template<> class fragment<accumulator, 32, 8, 16, float> : public __frag_base<float, 8> {};
|
| 315 |
+
|
| 316 |
+
#ifdef __CUDA_IMMA__
|
| 317 |
+
template<> class fragment<matrix_a, 32, 8, 16, signed char, row_major> : public __frag_base<signed char, 16> {};
|
| 318 |
+
template<> class fragment<matrix_a, 32, 8, 16, signed char, col_major> : public __frag_base<signed char, 16> {};
|
| 319 |
+
template<> class fragment<matrix_a, 32, 8, 16, unsigned char, row_major> : public __frag_base<unsigned char, 16> {};
|
| 320 |
+
template<> class fragment<matrix_a, 32, 8, 16, unsigned char, col_major> : public __frag_base<unsigned char, 16> {};
|
| 321 |
+
template<> class fragment<matrix_b, 32, 8, 16, signed char, row_major> : public __frag_base<signed char, 4> {};
|
| 322 |
+
template<> class fragment<matrix_b, 32, 8, 16, signed char, col_major> : public __frag_base<signed char, 4> {};
|
| 323 |
+
template<> class fragment<matrix_b, 32, 8, 16, unsigned char, row_major> : public __frag_base<unsigned char, 4> {};
|
| 324 |
+
template<> class fragment<matrix_b, 32, 8, 16, unsigned char, col_major> : public __frag_base<unsigned char, 4> {};
|
| 325 |
+
template<> class fragment<accumulator, 32, 8, 16, int> : public __frag_base<int, 8> {};
|
| 326 |
+
#endif /* __CUDA_IMMA__ */
|
| 327 |
+
|
| 328 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 329 |
+
template<> class fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 16> {};
|
| 330 |
+
template<> class fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 16> {};
|
| 331 |
+
template<> class fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 4> {};
|
| 332 |
+
template<> class fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 4> {};
|
| 333 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 334 |
+
|
| 335 |
+
//
|
| 336 |
+
// Fragments for 8x32x16
|
| 337 |
+
//
|
| 338 |
+
template<> class fragment<matrix_a, 8, 32, 16, __half, row_major> : public __frag_base<__half, 16> {};
|
| 339 |
+
template<> class fragment<matrix_a, 8, 32, 16, __half, col_major> : public __frag_base<__half, 16> {};
|
| 340 |
+
template<> class fragment<matrix_b, 8, 32, 16, __half, row_major> : public __frag_base<__half, 16> {};
|
| 341 |
+
template<> class fragment<matrix_b, 8, 32, 16, __half, col_major> : public __frag_base<__half, 16> {};
|
| 342 |
+
template<> class fragment<accumulator, 8, 32, 16, __half> : public __frag_base<__half, 8> {};
|
| 343 |
+
template<> class fragment<accumulator, 8, 32, 16, float> : public __frag_base<float, 8> {};
|
| 344 |
+
|
| 345 |
+
#ifdef __CUDA_IMMA__
|
| 346 |
+
template<> class fragment<matrix_a, 8, 32, 16, signed char, row_major> : public __frag_base<signed char, 4> {};
|
| 347 |
+
template<> class fragment<matrix_a, 8, 32, 16, signed char, col_major> : public __frag_base<signed char, 4> {};
|
| 348 |
+
template<> class fragment<matrix_a, 8, 32, 16, unsigned char, row_major> : public __frag_base<unsigned char, 4> {};
|
| 349 |
+
template<> class fragment<matrix_a, 8, 32, 16, unsigned char, col_major> : public __frag_base<unsigned char, 4> {};
|
| 350 |
+
template<> class fragment<matrix_b, 8, 32, 16, signed char, row_major> : public __frag_base<signed char, 16> {};
|
| 351 |
+
template<> class fragment<matrix_b, 8, 32, 16, signed char, col_major> : public __frag_base<signed char, 16> {};
|
| 352 |
+
template<> class fragment<matrix_b, 8, 32, 16, unsigned char, row_major> : public __frag_base<unsigned char, 16> {};
|
| 353 |
+
template<> class fragment<matrix_b, 8, 32, 16, unsigned char, col_major> : public __frag_base<unsigned char, 16> {};
|
| 354 |
+
template<> class fragment<accumulator, 8, 32, 16, int> : public __frag_base<int, 8> {};
|
| 355 |
+
#endif /* __CUDA_IMMA__ */
|
| 356 |
+
|
| 357 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 358 |
+
template<> class fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 4> {};
|
| 359 |
+
template<> class fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 4> {};
|
| 360 |
+
template<> class fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 16> {};
|
| 361 |
+
template<> class fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 16> {};
|
| 362 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 363 |
+
|
| 364 |
+
#ifdef __CUDA_SUBBYTE_IMMA__
|
| 365 |
+
//
|
| 366 |
+
// Fragments for 8x8x32
|
| 367 |
+
//
|
| 368 |
+
template<> class fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major> : public __frag_base<experimental::precision::u4, 8, 1> {};
|
| 369 |
+
template<> class fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major> : public __frag_base<experimental::precision::s4, 8, 1> {};
|
| 370 |
+
template<> class fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major> : public __frag_base<experimental::precision::u4, 8, 1> {};
|
| 371 |
+
template<> class fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major> : public __frag_base<experimental::precision::s4, 8, 1> {};
|
| 372 |
+
template<> class fragment<accumulator, 8, 8, 32, int> : public __frag_base<int, 2> {};
|
| 373 |
+
|
| 374 |
+
//
|
| 375 |
+
// Fragments for 8x8x128
|
| 376 |
+
//
|
| 377 |
+
template<> class fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major> : public __frag_base<experimental::precision::b1, 32, 1> {};
|
| 378 |
+
template<> class fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major> : public __frag_base<experimental::precision::b1, 32, 1> {};
|
| 379 |
+
template<> class fragment<accumulator, 8, 8, 128, int> : public __frag_base<int, 2> {};
|
| 380 |
+
#endif /* __CUDA_SUBBYTE_IMMA__ */
|
| 381 |
+
|
| 382 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 383 |
+
//
|
| 384 |
+
// Fragments for 16x16x8
|
| 385 |
+
//
|
| 386 |
+
template<> class fragment<matrix_a, 16, 16, 8, precision::tf32, row_major> : public __frag_base<precision::tf32, 4> {};
|
| 387 |
+
template<> class fragment<matrix_a, 16, 16, 8, precision::tf32, col_major> : public __frag_base<precision::tf32, 4> {};
|
| 388 |
+
template<> class fragment<matrix_b, 16, 16, 8, precision::tf32, row_major> : public __frag_base<precision::tf32, 4> {};
|
| 389 |
+
template<> class fragment<matrix_b, 16, 16, 8, precision::tf32, col_major> : public __frag_base<precision::tf32, 4> {};
|
| 390 |
+
template<> class fragment<accumulator, 16, 16, 8, float> : public __frag_base<float, 8> {};
|
| 391 |
+
|
| 392 |
+
//
|
| 393 |
+
// Fragments for 8x8x4
|
| 394 |
+
//
|
| 395 |
+
template<> class fragment<matrix_a, 8, 8, 4, double, row_major> : public __frag_base<double, 1> {};
|
| 396 |
+
template<> class fragment<matrix_a, 8, 8, 4, double, col_major> : public __frag_base<double, 1> {};
|
| 397 |
+
template<> class fragment<matrix_b, 8, 8, 4, double, row_major> : public __frag_base<double, 1> {};
|
| 398 |
+
template<> class fragment<matrix_b, 8, 8, 4, double, col_major> : public __frag_base<double, 1> {};
|
| 399 |
+
template<> class fragment<accumulator, 8, 8, 4, double> : public __frag_base<double, 2> {};
|
| 400 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 401 |
+
|
| 402 |
+
|
| 403 |
+
//
|
| 404 |
+
// Load functions for frags of shape m16n16k16
|
| 405 |
+
//
|
| 406 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
|
| 407 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
|
| 408 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
|
| 409 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
|
| 410 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 411 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 412 |
+
|
| 413 |
+
#ifdef __CUDA_IMMA__
|
| 414 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
|
| 415 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
|
| 416 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
|
| 417 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
|
| 418 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
|
| 419 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
|
| 420 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
|
| 421 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
|
| 422 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 423 |
+
#endif /* __CUDA_IMMA__ */
|
| 424 |
+
|
| 425 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 426 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
|
| 427 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
|
| 428 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
|
| 429 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
|
| 430 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 431 |
+
|
| 432 |
+
//
|
| 433 |
+
// Load functions for frags of shape m32n8k16
|
| 434 |
+
//
|
| 435 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
|
| 436 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
|
| 437 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
|
| 438 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
|
| 439 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 440 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 441 |
+
|
| 442 |
+
#ifdef __CUDA_IMMA__
|
| 443 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
|
| 444 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
|
| 445 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
|
| 446 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
|
| 447 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
|
| 448 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
|
| 449 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
|
| 450 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
|
| 451 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 452 |
+
#endif /* __CUDA_IMMA__ */
|
| 453 |
+
|
| 454 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 455 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
|
| 456 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
|
| 457 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
|
| 458 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
|
| 459 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 460 |
+
|
| 461 |
+
//
|
| 462 |
+
// Load functions for frags of shape m8n32k16
|
| 463 |
+
//
|
| 464 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
|
| 465 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
|
| 466 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
|
| 467 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
|
| 468 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 469 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 470 |
+
|
| 471 |
+
#ifdef __CUDA_IMMA__
|
| 472 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
|
| 473 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
|
| 474 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
|
| 475 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
|
| 476 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
|
| 477 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
|
| 478 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
|
| 479 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
|
| 480 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 481 |
+
#endif /* __CUDA_IMMA__ */
|
| 482 |
+
|
| 483 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 484 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
|
| 485 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
|
| 486 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
|
| 487 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
|
| 488 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 489 |
+
|
| 490 |
+
#ifdef __CUDA_SUBBYTE_IMMA__
|
| 491 |
+
//
|
| 492 |
+
// Load functions for frags of shape m8n8k32
|
| 493 |
+
//
|
| 494 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
|
| 495 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
|
| 496 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
|
| 497 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
|
| 498 |
+
|
| 499 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 32, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 500 |
+
|
| 501 |
+
//
|
| 502 |
+
// Load functions for frags of shape m8n8k128
|
| 503 |
+
//
|
| 504 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
|
| 505 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
|
| 506 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 128, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 507 |
+
|
| 508 |
+
#endif /* __CUDA_SUBBYTE_IMMA__ */
|
| 509 |
+
|
| 510 |
+
|
| 511 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 512 |
+
//
|
| 513 |
+
// Load functions for frags of shape m16n16k8
|
| 514 |
+
//
|
| 515 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
|
| 516 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
|
| 517 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
|
| 518 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
|
| 519 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 8, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 520 |
+
|
| 521 |
+
//
|
| 522 |
+
// Load functions for frags of shape m8n8k4
|
| 523 |
+
//
|
| 524 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
|
| 525 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
|
| 526 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
|
| 527 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
|
| 528 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 4, double>& a, const double* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 529 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 530 |
+
|
| 531 |
+
//
|
| 532 |
+
// Store functions for frags of shape m16n16k16
|
| 533 |
+
//
|
| 534 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 16, 16, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 535 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 16, 16, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 536 |
+
#ifdef __CUDA_IMMA__
|
| 537 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 16, 16, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 538 |
+
#endif /* __CUDA_IMMA__ */
|
| 539 |
+
|
| 540 |
+
//
|
| 541 |
+
// Store functions for frags of shape m32n8k16
|
| 542 |
+
//
|
| 543 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 32, 8, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 544 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 32, 8, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 545 |
+
#ifdef __CUDA_IMMA__
|
| 546 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 32, 8, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 547 |
+
#endif /* __CUDA_IMMA__ */
|
| 548 |
+
|
| 549 |
+
//
|
| 550 |
+
// Store functions for frags of shape m8n32k16
|
| 551 |
+
//
|
| 552 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 8, 32, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 553 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 8, 32, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 554 |
+
#ifdef __CUDA_IMMA__
|
| 555 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 32, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 556 |
+
#endif /* __CUDA_IMMA__ */
|
| 557 |
+
|
| 558 |
+
#ifdef __CUDA_SUBBYTE_IMMA__
|
| 559 |
+
//
|
| 560 |
+
// Store functions for frags of shape m8n8k32
|
| 561 |
+
//
|
| 562 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 32, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 563 |
+
|
| 564 |
+
//
|
| 565 |
+
// Store functions for frags of shape m8n8k128
|
| 566 |
+
//
|
| 567 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 128, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 568 |
+
|
| 569 |
+
#endif /* __CUDA_SUBBYTE_IMMA__ */
|
| 570 |
+
|
| 571 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 572 |
+
//
|
| 573 |
+
// Store functions for frags of shape m16n16k8
|
| 574 |
+
//
|
| 575 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 16, 16, 8, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 576 |
+
|
| 577 |
+
//
|
| 578 |
+
// Store functions for frags of shape m8n8k4
|
| 579 |
+
//
|
| 580 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(double *p, const fragment<accumulator, 8, 8, 4, double>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
|
| 581 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 582 |
+
|
| 583 |
+
//
|
| 584 |
+
// MMA functions for shape m16n16k16
|
| 585 |
+
//
|
| 586 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
|
| 587 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
|
| 588 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
|
| 589 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
|
| 590 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
|
| 591 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
|
| 592 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
|
| 593 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
|
| 594 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
|
| 595 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
|
| 596 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
|
| 597 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
|
| 598 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
|
| 599 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
|
| 600 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
|
| 601 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
|
| 602 |
+
|
| 603 |
+
#ifdef __CUDA_IMMA__
|
| 604 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 605 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 606 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 607 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 608 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 609 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 610 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 611 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 612 |
+
#endif /* __CUDA_IMMA__ */
|
| 613 |
+
|
| 614 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 615 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
|
| 616 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
|
| 617 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
|
| 618 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
|
| 619 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 620 |
+
|
| 621 |
+
//
|
| 622 |
+
// MMA functions for shape m32n8k16
|
| 623 |
+
//
|
| 624 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
|
| 625 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
|
| 626 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
|
| 627 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
|
| 628 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
|
| 629 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
|
| 630 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
|
| 631 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
|
| 632 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
|
| 633 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
|
| 634 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
|
| 635 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
|
| 636 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
|
| 637 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
|
| 638 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
|
| 639 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
|
| 640 |
+
|
| 641 |
+
#ifdef __CUDA_IMMA__
|
| 642 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 643 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 644 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 645 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 646 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 647 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 648 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 649 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 650 |
+
#endif /* __CUDA_IMMA__ */
|
| 651 |
+
|
| 652 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 653 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
|
| 654 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
|
| 655 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
|
| 656 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
|
| 657 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 658 |
+
|
| 659 |
+
//
|
| 660 |
+
// MMA functions for shape m8n32k16
|
| 661 |
+
//
|
| 662 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
|
| 663 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
|
| 664 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
|
| 665 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
|
| 666 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
|
| 667 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
|
| 668 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
|
| 669 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
|
| 670 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
|
| 671 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
|
| 672 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
|
| 673 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
|
| 674 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
|
| 675 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
|
| 676 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
|
| 677 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
|
| 678 |
+
|
| 679 |
+
#ifdef __CUDA_IMMA__
|
| 680 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 681 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 682 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 683 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 684 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 685 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 686 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 687 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
|
| 688 |
+
#endif /* __CUDA_IMMA__ */
|
| 689 |
+
|
| 690 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 691 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
|
| 692 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
|
| 693 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
|
| 694 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
|
| 695 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 696 |
+
|
| 697 |
+
#ifdef __CUDA_SUBBYTE_IMMA__
|
| 698 |
+
//
|
| 699 |
+
// MMA functions for shape m8n8k32
|
| 700 |
+
//
|
| 701 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf=false) __DEF_IF_HOST
|
| 702 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf=false) __DEF_IF_HOST
|
| 703 |
+
|
| 704 |
+
|
| 705 |
+
//
|
| 706 |
+
// MMA functions for shape m8n8k128
|
| 707 |
+
//
|
| 708 |
+
__CUDA_MMA_DEVICE_DECL__ void bmma_sync(fragment<accumulator, 8, 8, 128, int>& d, const fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& b, const fragment<accumulator, 8, 8, 128, int>& c,
|
| 709 |
+
experimental::bmmaBitOp = experimental::bmmaBitOpXOR,
|
| 710 |
+
experimental::bmmaAccumulateOp = experimental::bmmaAccumulateOpPOPC) __DEF_IF_HOST
|
| 711 |
+
|
| 712 |
+
#endif /* __CUDA_SUBBYTE_IMMA__ */
|
| 713 |
+
|
| 714 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 715 |
+
//
|
| 716 |
+
// MMA functions for shape m16n16k8
|
| 717 |
+
//
|
| 718 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
|
| 719 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
|
| 720 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
|
| 721 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
|
| 722 |
+
|
| 723 |
+
//
|
| 724 |
+
// MMA functions for shape m8n8k4
|
| 725 |
+
//
|
| 726 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
|
| 727 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
|
| 728 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
|
| 729 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
|
| 730 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 731 |
+
};
|
| 732 |
+
};
|
| 733 |
+
|
| 734 |
+
#undef __DEF_IF_HOST
|
| 735 |
+
#undef __CUDA_IMMA__
|
| 736 |
+
#undef __CUDA_SUBBYTE_IMMA__
|
| 737 |
+
#undef __CUDA_AMPERE_MMA__
|
| 738 |
+
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
|
| 739 |
+
|
| 740 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 741 |
+
|
| 742 |
+
#undef __CUDA_MMA_DEVICE_DECL__
|
| 743 |
+
|
| 744 |
+
#if defined(__CUDA_ARCH__)
|
| 745 |
+
#include "mma.hpp"
|
| 746 |
+
#endif /* defined(__CUDA_ARCH__) */
|
| 747 |
+
|
| 748 |
+
|
| 749 |
+
#endif /* !__CUDA_MMA_H__ */
|
| 750 |
+
|
| 751 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__)
|
| 752 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 753 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__
|
| 754 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/mma.hpp
ADDED
|
@@ -0,0 +1,1128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2017-2020 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("crt/mma.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "crt/mma.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_HPP__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#if !defined(__CUDA_MMA_HPP__)
|
| 61 |
+
#define __CUDA_MMA_HPP__
|
| 62 |
+
|
| 63 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 64 |
+
|
| 65 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
|
| 66 |
+
|
| 67 |
+
#include <cuda_fp16.h>
|
| 68 |
+
#include <cuda_bf16.h>
|
| 69 |
+
|
| 70 |
+
#define __CUDA_MMA_DEVICE_DECL__ static __device__ __inline__
|
| 71 |
+
|
| 72 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720
|
| 73 |
+
#define __CUDA_IMMA__ 1
|
| 74 |
+
#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720 */
|
| 75 |
+
|
| 76 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730
|
| 77 |
+
#define __CUDA_SUBBYTE_IMMA__ 1
|
| 78 |
+
#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730 */
|
| 79 |
+
|
| 80 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
|
| 81 |
+
#define __CUDA_AMPERE_MMA__ 1
|
| 82 |
+
#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 */
|
| 83 |
+
|
| 84 |
+
namespace nvcuda {
|
| 85 |
+
namespace wmma {
|
| 86 |
+
|
| 87 |
+
//
|
| 88 |
+
// Load functions for frags of shape m16n16k16
|
| 89 |
+
//
|
| 90 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
|
| 91 |
+
__hmma_m16n16k16_ld_a((int*)&a, (const int*)p, ldm, 0);
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
|
| 95 |
+
__hmma_m16n16k16_ld_a((int*)&a, (const int*)p, ldm, 1);
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b,16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
|
| 99 |
+
__hmma_m16n16k16_ld_b((int*)&a, (const int*)p, ldm, 0);
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b,16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
|
| 103 |
+
__hmma_m16n16k16_ld_b((int*)&a, (const int*)p, ldm, 1);
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator,16, 16, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) {
|
| 107 |
+
if (layout == mem_row_major)
|
| 108 |
+
__hmma_m16n16k16_ld_c_f16((int*)&a, (const int*)p, ldm, 0);
|
| 109 |
+
else
|
| 110 |
+
__hmma_m16n16k16_ld_c_f16((int*)&a, (const int*)p, ldm, 1);
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator,16, 16, 16, float>& a, const float* p, unsigned ldm, layout_t layout) {
|
| 114 |
+
if (layout == mem_row_major)
|
| 115 |
+
__hmma_m16n16k16_ld_c_f32((float*)&a, (const float*)p, ldm, 0);
|
| 116 |
+
else
|
| 117 |
+
__hmma_m16n16k16_ld_c_f32((float*)&a, (const float*)p, ldm, 1);
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
#ifdef __CUDA_IMMA__
|
| 121 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
|
| 122 |
+
__imma_m16n16k16_ld_a_s8((int *)&a, (const int *)p, ldm, 0);
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
|
| 126 |
+
__imma_m16n16k16_ld_a_s8((int *)&a, (const int *)p, ldm, 1);
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
|
| 130 |
+
__imma_m16n16k16_ld_a_u8((int *)&a, (const int *)p, ldm, 0);
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
|
| 134 |
+
__imma_m16n16k16_ld_a_u8((int *)&a, (const int *)p, ldm, 1);
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
|
| 138 |
+
__imma_m16n16k16_ld_b_s8((int *)&a, (const int *)p, ldm, 0);
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
|
| 142 |
+
__imma_m16n16k16_ld_b_s8((int *)&a, (const int *)p, ldm, 1);
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
|
| 146 |
+
__imma_m16n16k16_ld_b_u8((int *)&a, (const int *)p, ldm, 0);
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
|
| 150 |
+
__imma_m16n16k16_ld_b_u8((int *)&a, (const int *)p, ldm, 1);
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator,16, 16, 16, int>& a, const int* p, unsigned ldm, layout_t layout) {
|
| 154 |
+
if (layout == mem_row_major)
|
| 155 |
+
__imma_m16n16k16_ld_c((int *)&a, (const int*)p, ldm, 0);
|
| 156 |
+
else
|
| 157 |
+
__imma_m16n16k16_ld_c((int *)&a, (const int*)p, ldm, 1);
|
| 158 |
+
}
|
| 159 |
+
#endif /* __CUDA_IMMA__ */
|
| 160 |
+
|
| 161 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 162 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
|
| 163 |
+
__mma_bf16_m16n16k16_ld_a((int*)&a, (const int*)p, ldm, 0);
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
|
| 167 |
+
__mma_bf16_m16n16k16_ld_a((int*)&a, (const int*)p, ldm, 1);
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
|
| 171 |
+
__mma_bf16_m16n16k16_ld_b((int*)&a, (const int*)p, ldm, 0);
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
|
| 175 |
+
__mma_bf16_m16n16k16_ld_b((int*)&a, (const int*)p, ldm, 1);
|
| 176 |
+
}
|
| 177 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
//
|
| 181 |
+
// Load functions for frags of shape m32n8k16
|
| 182 |
+
//
|
| 183 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
|
| 184 |
+
__hmma_m32n8k16_ld_a((int*)&a, (const int*)p, ldm, 0);
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
|
| 188 |
+
__hmma_m32n8k16_ld_a((int*)&a, (const int*)p, ldm, 1);
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
|
| 192 |
+
__hmma_m32n8k16_ld_b((int*)&a, (const int*)p, ldm, 0);
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
|
| 196 |
+
__hmma_m32n8k16_ld_b((int*)&a, (const int*)p, ldm, 1);
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) {
|
| 200 |
+
if (layout == mem_row_major)
|
| 201 |
+
__hmma_m32n8k16_ld_c_f16((int*)&a, (const int*)p, ldm, 0);
|
| 202 |
+
else
|
| 203 |
+
__hmma_m32n8k16_ld_c_f16((int*)&a, (const int*)p, ldm, 1);
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, float>& a, const float* p, unsigned ldm, layout_t layout) {
|
| 207 |
+
if (layout == mem_row_major)
|
| 208 |
+
__hmma_m32n8k16_ld_c_f32((float*)&a, (const float*)p, ldm, 0);
|
| 209 |
+
else
|
| 210 |
+
__hmma_m32n8k16_ld_c_f32((float*)&a, (const float*)p, ldm, 1);
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
#ifdef __CUDA_IMMA__
|
| 214 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
|
| 215 |
+
__imma_m32n8k16_ld_a_s8((int *)&a, (const int *)p, ldm, 0);
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
|
| 219 |
+
__imma_m32n8k16_ld_a_s8((int *)&a, (const int *)p, ldm, 1);
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
|
| 223 |
+
__imma_m32n8k16_ld_a_u8((int *)&a, (const int *)p, ldm, 0);
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
|
| 227 |
+
__imma_m32n8k16_ld_a_u8((int *)&a, (const int *)p, ldm, 1);
|
| 228 |
+
}
|
| 229 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
|
| 230 |
+
__imma_m32n8k16_ld_b_s8((int *)&a, (const int *)p, ldm, 0);
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
|
| 234 |
+
__imma_m32n8k16_ld_b_s8((int *)&a, (const int *)p, ldm, 1);
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
|
| 238 |
+
__imma_m32n8k16_ld_b_u8((int *)&a, (const int *)p, ldm, 0);
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
|
| 242 |
+
__imma_m32n8k16_ld_b_u8((int *)&a, (const int *)p, ldm, 1);
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, int>& a, const int* p, unsigned ldm, layout_t layout) {
|
| 246 |
+
if (layout == mem_row_major)
|
| 247 |
+
__imma_m32n8k16_ld_c((int *)&a, (const int*)p, ldm, 0);
|
| 248 |
+
else
|
| 249 |
+
__imma_m32n8k16_ld_c((int *)&a, (const int*)p, ldm, 1);
|
| 250 |
+
}
|
| 251 |
+
#endif /* __CUDA_IMMA__ */
|
| 252 |
+
|
| 253 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 254 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
|
| 255 |
+
__mma_bf16_m32n8k16_ld_a((int*)&a, (const int*)p, ldm, 0);
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
|
| 259 |
+
__mma_bf16_m32n8k16_ld_a((int*)&a, (const int*)p, ldm, 1);
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
|
| 263 |
+
__mma_bf16_m32n8k16_ld_b((int*)&a, (const int*)p, ldm, 0);
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
|
| 267 |
+
__mma_bf16_m32n8k16_ld_b((int*)&a, (const int*)p, ldm, 1);
|
| 268 |
+
}
|
| 269 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
//
|
| 273 |
+
// Load functions for frags of shape m8n32k16
|
| 274 |
+
//
|
| 275 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
|
| 276 |
+
__hmma_m8n32k16_ld_a((int*)&a, (const int*)p, ldm, 0);
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
|
| 280 |
+
__hmma_m8n32k16_ld_a((int*)&a, (const int*)p, ldm, 1);
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
|
| 284 |
+
__hmma_m8n32k16_ld_b((int*)&a, (const int*)p, ldm, 0);
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
|
| 288 |
+
__hmma_m8n32k16_ld_b((int*)&a, (const int*)p, ldm, 1);
|
| 289 |
+
}
|
| 290 |
+
|
| 291 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) {
|
| 292 |
+
if (layout == mem_row_major)
|
| 293 |
+
__hmma_m8n32k16_ld_c_f16((int*)&a, (const int*)p, ldm, 0);
|
| 294 |
+
else
|
| 295 |
+
__hmma_m8n32k16_ld_c_f16((int*)&a, (const int*)p, ldm, 1);
|
| 296 |
+
}
|
| 297 |
+
|
| 298 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, float>& a, const float* p, unsigned ldm, layout_t layout) {
|
| 299 |
+
if (layout == mem_row_major)
|
| 300 |
+
__hmma_m8n32k16_ld_c_f32((float*)&a, (const float*)p, ldm, 0);
|
| 301 |
+
else
|
| 302 |
+
__hmma_m8n32k16_ld_c_f32((float*)&a, (const float*)p, ldm, 1);
|
| 303 |
+
}
|
| 304 |
+
|
| 305 |
+
#ifdef __CUDA_IMMA__
|
| 306 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
|
| 307 |
+
__imma_m8n32k16_ld_a_s8((int *)&a, (const int *)p, ldm, 0);
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
|
| 311 |
+
__imma_m8n32k16_ld_a_s8((int *)&a, (const int *)p, ldm, 1);
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
|
| 315 |
+
__imma_m8n32k16_ld_a_u8((int *)&a, (const int *)p, ldm, 0);
|
| 316 |
+
}
|
| 317 |
+
|
| 318 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
|
| 319 |
+
__imma_m8n32k16_ld_a_u8((int *)&a, (const int *)p, ldm, 1);
|
| 320 |
+
}
|
| 321 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
|
| 322 |
+
__imma_m8n32k16_ld_b_s8((int *)&a, (const int *)p, ldm, 0);
|
| 323 |
+
}
|
| 324 |
+
|
| 325 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
|
| 326 |
+
__imma_m8n32k16_ld_b_s8((int *)&a, (const int *)p, ldm, 1);
|
| 327 |
+
}
|
| 328 |
+
|
| 329 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
|
| 330 |
+
__imma_m8n32k16_ld_b_u8((int *)&a, (const int *)p, ldm, 0);
|
| 331 |
+
}
|
| 332 |
+
|
| 333 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
|
| 334 |
+
__imma_m8n32k16_ld_b_u8((int *)&a, (const int *)p, ldm, 1);
|
| 335 |
+
}
|
| 336 |
+
|
| 337 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, int>& a, const int* p, unsigned ldm, layout_t layout) {
|
| 338 |
+
if (layout == mem_row_major)
|
| 339 |
+
__imma_m8n32k16_ld_c((int *)&a, (const int*)p, ldm, 0);
|
| 340 |
+
else
|
| 341 |
+
__imma_m8n32k16_ld_c((int *)&a, (const int*)p, ldm, 1);
|
| 342 |
+
}
|
| 343 |
+
#endif /* __CUDA_IMMA__ */
|
| 344 |
+
|
| 345 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 346 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
|
| 347 |
+
__mma_bf16_m8n32k16_ld_a((int*)&a, (const int*)p, ldm, 0);
|
| 348 |
+
}
|
| 349 |
+
|
| 350 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
|
| 351 |
+
__mma_bf16_m8n32k16_ld_a((int*)&a, (const int*)p, ldm, 1);
|
| 352 |
+
}
|
| 353 |
+
|
| 354 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
|
| 355 |
+
__mma_bf16_m8n32k16_ld_b((int*)&a, (const int*)p, ldm, 0);
|
| 356 |
+
}
|
| 357 |
+
|
| 358 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
|
| 359 |
+
__mma_bf16_m8n32k16_ld_b((int*)&a, (const int*)p, ldm, 1);
|
| 360 |
+
}
|
| 361 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
#ifdef __CUDA_SUBBYTE_IMMA__
|
| 365 |
+
//
|
| 366 |
+
// Load functions for frags of shape m8n8k32
|
| 367 |
+
//
|
| 368 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const void* p, unsigned ldm) {
|
| 369 |
+
__imma_m8n8k32_ld_a_s4((int *)&a, (const int *)p, ldm, 0);
|
| 370 |
+
}
|
| 371 |
+
|
| 372 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const void* p, unsigned ldm) {
|
| 373 |
+
__imma_m8n8k32_ld_a_u4((int *)&a, (const int *)p, ldm, 0);
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& a, const void* p, unsigned ldm) {
|
| 377 |
+
__imma_m8n8k32_ld_b_s4((int *)&a, (const int *)p, ldm, 1);
|
| 378 |
+
}
|
| 379 |
+
|
| 380 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& a, const void* p, unsigned ldm) {
|
| 381 |
+
__imma_m8n8k32_ld_b_u4((int *)&a, (const int *)p, ldm, 1);
|
| 382 |
+
}
|
| 383 |
+
|
| 384 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 32, int>& a, const int* p, unsigned ldm, layout_t layout) {
|
| 385 |
+
if (layout == mem_row_major)
|
| 386 |
+
__imma_m8n8k32_ld_c((int *)&a, (const int*)p, ldm, 0);
|
| 387 |
+
else
|
| 388 |
+
__imma_m8n8k32_ld_c((int *)&a, (const int*)p, ldm, 1);
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
//
|
| 392 |
+
// Load functions for frags of shape m8n8k128
|
| 393 |
+
//
|
| 394 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const void* p, unsigned ldm) {
|
| 395 |
+
__bmma_m8n8k128_ld_a_b1((int *)&a, (const int *)p, ldm, 0);
|
| 396 |
+
}
|
| 397 |
+
|
| 398 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& a, const void* p, unsigned ldm) {
|
| 399 |
+
__bmma_m8n8k128_ld_b_b1((int *)&a, (const int *)p, ldm, 1);
|
| 400 |
+
}
|
| 401 |
+
|
| 402 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 128, int>& a, const int* p, unsigned ldm, layout_t layout) {
|
| 403 |
+
if (layout == mem_row_major)
|
| 404 |
+
__bmma_m8n8k128_ld_c((int *)&a, (const int*)p, ldm, 0);
|
| 405 |
+
else
|
| 406 |
+
__bmma_m8n8k128_ld_c((int *)&a, (const int*)p, ldm, 1);
|
| 407 |
+
}
|
| 408 |
+
#endif /* __CUDA_SUBBYTE_IMMA__ */
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
|
| 412 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 413 |
+
// load functions for frags of shape m16n16k8
|
| 414 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) {
|
| 415 |
+
__mma_tf32_m16n16k8_ld_a((int *)&a, (const int *)p, ldm, 0);
|
| 416 |
+
}
|
| 417 |
+
|
| 418 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) {
|
| 419 |
+
__mma_tf32_m16n16k8_ld_a((int *)&a, (const int *)p, ldm, 1);
|
| 420 |
+
}
|
| 421 |
+
|
| 422 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) {
|
| 423 |
+
__mma_tf32_m16n16k8_ld_b((int *)&a, (const int *)p, ldm, 0);
|
| 424 |
+
}
|
| 425 |
+
|
| 426 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) {
|
| 427 |
+
__mma_tf32_m16n16k8_ld_b((int *)&a, (const int *)p, ldm, 1);
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 8, float>& a, const float* p, unsigned ldm, layout_t layout) {
|
| 431 |
+
if (layout == mem_row_major)
|
| 432 |
+
__mma_tf32_m16n16k8_ld_c((float *)&a, p, ldm, 0);
|
| 433 |
+
else
|
| 434 |
+
__mma_tf32_m16n16k8_ld_c((float *)&a, p, ldm, 1);
|
| 435 |
+
}
|
| 436 |
+
|
| 437 |
+
// load functions for frags of shape m8n8k4
|
| 438 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) {
|
| 439 |
+
__dmma_m8n8k4_ld_a((double *)&a, p, ldm, 0);
|
| 440 |
+
}
|
| 441 |
+
|
| 442 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) {
|
| 443 |
+
__dmma_m8n8k4_ld_a((double *)&a, p, ldm, 1);
|
| 444 |
+
}
|
| 445 |
+
|
| 446 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) {
|
| 447 |
+
__dmma_m8n8k4_ld_b((double *)&a, p, ldm, 0);
|
| 448 |
+
}
|
| 449 |
+
|
| 450 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) {
|
| 451 |
+
__dmma_m8n8k4_ld_b((double *)&a, p, ldm, 1);
|
| 452 |
+
}
|
| 453 |
+
|
| 454 |
+
__CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 4, double>& a, const double* p, unsigned ldm, layout_t layout) {
|
| 455 |
+
if (layout == mem_row_major)
|
| 456 |
+
__dmma_m8n8k4_ld_c((double *)&a, p, ldm, 0);
|
| 457 |
+
else
|
| 458 |
+
__dmma_m8n8k4_ld_c((double *)&a, p, ldm, 1);
|
| 459 |
+
}
|
| 460 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 461 |
+
|
| 462 |
+
//
|
| 463 |
+
// Store functions for frags of shape m16n16k16
|
| 464 |
+
//
|
| 465 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator,16, 16, 16, __half>& a, unsigned ldm, layout_t layout) {
|
| 466 |
+
if (layout == mem_row_major)
|
| 467 |
+
__hmma_m16n16k16_st_c_f16((int*)p, (int*)&a, ldm, 0);
|
| 468 |
+
else
|
| 469 |
+
__hmma_m16n16k16_st_c_f16((int*)p, (int*)&a, ldm, 1);
|
| 470 |
+
}
|
| 471 |
+
|
| 472 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator,16, 16, 16, float>& a, unsigned ldm, layout_t layout) {
|
| 473 |
+
if (layout == mem_row_major)
|
| 474 |
+
__hmma_m16n16k16_st_c_f32((float*)p, (float*)&a, ldm, 0);
|
| 475 |
+
else
|
| 476 |
+
__hmma_m16n16k16_st_c_f32((float*)p, (float*)&a, ldm, 1);
|
| 477 |
+
}
|
| 478 |
+
|
| 479 |
+
#ifdef __CUDA_IMMA__
|
| 480 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator,16, 16, 16, int>& a, unsigned ldm, layout_t layout) {
|
| 481 |
+
if (layout == mem_row_major)
|
| 482 |
+
__imma_m16n16k16_st_c_i32(p, (const int*)&a, ldm, 0);
|
| 483 |
+
else
|
| 484 |
+
__imma_m16n16k16_st_c_i32(p, (const int*)&a, ldm, 1);
|
| 485 |
+
}
|
| 486 |
+
#endif /* __CUDA_IMMA__ */
|
| 487 |
+
|
| 488 |
+
//
|
| 489 |
+
// Store functions for frags of shape m32n8k16
|
| 490 |
+
//
|
| 491 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 32, 8, 16, __half>& a, unsigned ldm, layout_t layout) {
|
| 492 |
+
if (layout == mem_row_major)
|
| 493 |
+
__hmma_m32n8k16_st_c_f16((int*)p, (int*)&a, ldm, 0);
|
| 494 |
+
else
|
| 495 |
+
__hmma_m32n8k16_st_c_f16((int*)p, (int*)&a, ldm, 1);
|
| 496 |
+
}
|
| 497 |
+
|
| 498 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 32, 8, 16, float>& a, unsigned ldm, layout_t layout) {
|
| 499 |
+
if (layout == mem_row_major)
|
| 500 |
+
__hmma_m32n8k16_st_c_f32((float*)p, (float*)&a, ldm, 0);
|
| 501 |
+
else
|
| 502 |
+
__hmma_m32n8k16_st_c_f32((float*)p, (float*)&a, ldm, 1);
|
| 503 |
+
}
|
| 504 |
+
|
| 505 |
+
#ifdef __CUDA_IMMA__
|
| 506 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 32, 8, 16, int>& a, unsigned ldm, layout_t layout) {
|
| 507 |
+
if (layout == mem_row_major)
|
| 508 |
+
__imma_m32n8k16_st_c_i32(p, (const int*)&a, ldm, 0);
|
| 509 |
+
else
|
| 510 |
+
__imma_m32n8k16_st_c_i32(p, (const int*)&a, ldm, 1);
|
| 511 |
+
}
|
| 512 |
+
#endif /* __CUDA_IMMA__ */
|
| 513 |
+
|
| 514 |
+
//
|
| 515 |
+
// Store functions for frags of shape m8n32k16
|
| 516 |
+
//
|
| 517 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 8, 32, 16, __half>& a, unsigned ldm, layout_t layout) {
|
| 518 |
+
if (layout == mem_row_major)
|
| 519 |
+
__hmma_m8n32k16_st_c_f16((int*)p, (int*)&a, ldm, 0);
|
| 520 |
+
else
|
| 521 |
+
__hmma_m8n32k16_st_c_f16((int*)p, (int*)&a, ldm, 1);
|
| 522 |
+
}
|
| 523 |
+
|
| 524 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 8, 32, 16, float>& a, unsigned ldm, layout_t layout) {
|
| 525 |
+
if (layout == mem_row_major)
|
| 526 |
+
__hmma_m8n32k16_st_c_f32((float*)p, (float*)&a, ldm, 0);
|
| 527 |
+
else
|
| 528 |
+
__hmma_m8n32k16_st_c_f32((float*)p, (float*)&a, ldm, 1);
|
| 529 |
+
}
|
| 530 |
+
|
| 531 |
+
#ifdef __CUDA_IMMA__
|
| 532 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 32, 16, int>& a, unsigned ldm, layout_t layout) {
|
| 533 |
+
if (layout == mem_row_major)
|
| 534 |
+
__imma_m8n32k16_st_c_i32(p, (const int*)&a, ldm, 0);
|
| 535 |
+
else
|
| 536 |
+
__imma_m8n32k16_st_c_i32(p, (const int*)&a, ldm, 1);
|
| 537 |
+
}
|
| 538 |
+
#endif /* __CUDA_IMMA__ */
|
| 539 |
+
|
| 540 |
+
#ifdef __CUDA_SUBBYTE_IMMA__
|
| 541 |
+
//
|
| 542 |
+
// Store functions for frags of shape m8n8k32
|
| 543 |
+
//
|
| 544 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 32, int>& a, unsigned ldm, layout_t layout) {
|
| 545 |
+
if (layout == mem_row_major)
|
| 546 |
+
__imma_m8n8k32_st_c_i32(p, (const int*)&a, ldm, 0);
|
| 547 |
+
else
|
| 548 |
+
__imma_m8n8k32_st_c_i32(p, (const int*)&a, ldm, 1);
|
| 549 |
+
}
|
| 550 |
+
|
| 551 |
+
//
|
| 552 |
+
// Store functions for frags of shape m8n8k128
|
| 553 |
+
//
|
| 554 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 128, int>& a, unsigned ldm, layout_t layout) {
|
| 555 |
+
if (layout == mem_row_major)
|
| 556 |
+
__bmma_m8n8k128_st_c_i32(p, (const int*)&a, ldm, 0);
|
| 557 |
+
else
|
| 558 |
+
__bmma_m8n8k128_st_c_i32(p, (const int*)&a, ldm, 1);
|
| 559 |
+
}
|
| 560 |
+
#endif /* __CUDA_SUBBYTE_IMMA__ */
|
| 561 |
+
|
| 562 |
+
|
| 563 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 564 |
+
|
| 565 |
+
//
|
| 566 |
+
// Store functions for frags of shape m16n16k8
|
| 567 |
+
//
|
| 568 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 16, 16, 8, float>& a, unsigned ldm, layout_t layout) {
|
| 569 |
+
if (layout == mem_row_major)
|
| 570 |
+
__mma_m16n16k8_st_c_f32(p, (const float*)&a, ldm, 0);
|
| 571 |
+
else
|
| 572 |
+
__mma_m16n16k8_st_c_f32(p, (const float*)&a, ldm, 1);
|
| 573 |
+
}
|
| 574 |
+
|
| 575 |
+
|
| 576 |
+
//
|
| 577 |
+
// Store functions for frags of shape m8n8k4
|
| 578 |
+
//
|
| 579 |
+
__CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(double *p, const fragment<accumulator, 8, 8, 4, double>& a, unsigned ldm, layout_t layout) {
|
| 580 |
+
if (layout == mem_row_major)
|
| 581 |
+
__dmma_m8n8k4_st_c_f64(p, (const double*)&a, ldm, 0);
|
| 582 |
+
else
|
| 583 |
+
__dmma_m8n8k4_st_c_f64(p, (const double*)&a, ldm, 1);
|
| 584 |
+
}
|
| 585 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 586 |
+
|
| 587 |
+
//
|
| 588 |
+
// MMA functions for shape m16n16k16
|
| 589 |
+
//
|
| 590 |
+
// D fp16, C fp16
|
| 591 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
|
| 592 |
+
__hmma_m16n16k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
|
| 593 |
+
}
|
| 594 |
+
|
| 595 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
|
| 596 |
+
__hmma_m16n16k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
|
| 597 |
+
}
|
| 598 |
+
|
| 599 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
|
| 600 |
+
__hmma_m16n16k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
|
| 601 |
+
}
|
| 602 |
+
|
| 603 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
|
| 604 |
+
__hmma_m16n16k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
|
| 605 |
+
}
|
| 606 |
+
|
| 607 |
+
// D fp32, C fp16
|
| 608 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
|
| 609 |
+
__hmma_m16n16k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
|
| 610 |
+
}
|
| 611 |
+
|
| 612 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
|
| 613 |
+
__hmma_m16n16k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
|
| 614 |
+
}
|
| 615 |
+
|
| 616 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
|
| 617 |
+
__hmma_m16n16k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
|
| 618 |
+
}
|
| 619 |
+
|
| 620 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
|
| 621 |
+
__hmma_m16n16k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
|
| 622 |
+
}
|
| 623 |
+
|
| 624 |
+
// m16n16k16 MMA overloads: D=fp32 accumulator, C=fp32.
// Layout argument: bit1 = A col_major, bit0 = B col_major.
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
  __hmma_m16n16k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
  __hmma_m16n16k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
  __hmma_m16n16k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
  __hmma_m16n16k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
}
// m16n16k16 MMA overloads: D=fp16 result, C=fp32 accumulator input.
// Layout argument: bit1 = A col_major, bit0 = B col_major.
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
  __hmma_m16n16k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
  __hmma_m16n16k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
  __hmma_m16n16k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
  __hmma_m16n16k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
}
#ifdef __CUDA_IMMA__
// m16n16k16 integer MMA (s8/u8 inputs, s32 accumulate).
// Layout argument: bit1 = A col_major, bit0 = B col_major.
// Final argument selects saturating accumulation when satf is true.
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
  __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 1, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
  __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 3, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
  __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 0, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
  __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 2, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
  __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 1, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
  __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 3, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
  __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 0, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
  __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 2, satf ? 1 : 0);
}
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
// m16n16k16 bfloat16 MMA (fp32 accumulate), Ampere and newer only.
// Layout argument: bit1 = A col_major, bit0 = B col_major.
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
  __mma_bf16_m16n16k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
  __mma_bf16_m16n16k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
  __mma_bf16_m16n16k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
  __mma_bf16_m16n16k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
}
#endif /* __CUDA_AMPERE_MMA__ */
//
// MMA functions for shape m32n8k16
//
// D=fp16, C=fp16. Layout argument: bit1 = A col_major, bit0 = B col_major.
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
  __hmma_m32n8k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
  __hmma_m32n8k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
  __hmma_m32n8k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
  __hmma_m32n8k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
}
// m32n8k16 MMA overloads: D=fp32 accumulator, C=fp16.
// Layout argument: bit1 = A col_major, bit0 = B col_major.
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
  __hmma_m32n8k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
  __hmma_m32n8k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
  __hmma_m32n8k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
  __hmma_m32n8k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
}
// m32n8k16 MMA overloads: D=fp32 accumulator, C=fp32.
// Layout argument: bit1 = A col_major, bit0 = B col_major.
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
  __hmma_m32n8k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
  __hmma_m32n8k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
  __hmma_m32n8k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
  __hmma_m32n8k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
}
// m32n8k16 MMA overloads: D=fp16 result, C=fp32 accumulator input.
// Layout argument: bit1 = A col_major, bit0 = B col_major.
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
  __hmma_m32n8k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
  __hmma_m32n8k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
  __hmma_m32n8k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
  __hmma_m32n8k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
}
#ifdef __CUDA_IMMA__
// m32n8k16 integer MMA (s8/u8 inputs, s32 accumulate).
// Layout argument: bit1 = A col_major, bit0 = B col_major.
// Final argument selects saturating accumulation when satf is true.
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
  __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
  __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
  __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
  __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
  __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
  __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
  __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
  __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, satf ? 1 : 0);
}
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
// m32n8k16 bfloat16 MMA (fp32 accumulate), Ampere and newer only.
// Layout argument: bit1 = A col_major, bit0 = B col_major.
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) {
  __mma_bf16_m32n8k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) {
  __mma_bf16_m32n8k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) {
  __mma_bf16_m32n8k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) {
  __mma_bf16_m32n8k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
}
#endif /* __CUDA_AMPERE_MMA__ */
//
// MMA functions for shape m8n32k16
//
// D=fp16, C=fp16. Layout argument: bit1 = A col_major, bit0 = B col_major.
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
  __hmma_m8n32k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
  __hmma_m8n32k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
  __hmma_m8n32k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
  __hmma_m8n32k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
}
// m8n32k16 MMA overloads: D=fp32 accumulator, C=fp16.
// Layout argument: bit1 = A col_major, bit0 = B col_major.
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
  __hmma_m8n32k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
  __hmma_m8n32k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
  __hmma_m8n32k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
  __hmma_m8n32k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
}
// m8n32k16 MMA overloads: D=fp32 accumulator, C=fp32.
// Layout argument: bit1 = A col_major, bit0 = B col_major.
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
  __hmma_m8n32k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
  __hmma_m8n32k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
  __hmma_m8n32k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
  __hmma_m8n32k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
}
// m8n32k16 MMA overloads: D=fp16 result, C=fp32 accumulator input.
// Layout argument: bit1 = A col_major, bit0 = B col_major.
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
  __hmma_m8n32k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
  __hmma_m8n32k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
  __hmma_m8n32k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
  __hmma_m8n32k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
}
#ifdef __CUDA_IMMA__
// m8n32k16 integer MMA (s8/u8 inputs, s32 accumulate).
// Layout argument: bit1 = A col_major, bit0 = B col_major.
// Final argument selects saturating accumulation when satf is true.
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
  __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
  __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
  __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
  __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
  __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
  __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
  __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, satf ? 1 : 0);
}

__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
  __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, satf ? 1 : 0);
}
#endif /* __CUDA_IMMA__ */
#ifdef __CUDA_AMPERE_MMA__
|
| 1014 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) {
|
| 1015 |
+
__mma_bf16_m8n32k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
|
| 1016 |
+
}
|
| 1017 |
+
|
| 1018 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) {
|
| 1019 |
+
__mma_bf16_m8n32k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
|
| 1020 |
+
}
|
| 1021 |
+
|
| 1022 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) {
|
| 1023 |
+
__mma_bf16_m8n32k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
|
| 1024 |
+
}
|
| 1025 |
+
|
| 1026 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) {
|
| 1027 |
+
__mma_bf16_m8n32k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
|
| 1028 |
+
}
|
| 1029 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 1030 |
+
|
| 1031 |
+
|
| 1032 |
+
#ifdef __CUDA_SUBBYTE_IMMA__
|
| 1033 |
+
//
|
| 1034 |
+
// MMA functions for shape m8n8k32
|
| 1035 |
+
//
|
| 1036 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf) {
|
| 1037 |
+
if (satf)
|
| 1038 |
+
__imma_m8n8k32_mma_s4((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 1);
|
| 1039 |
+
else
|
| 1040 |
+
__imma_m8n8k32_mma_s4((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 0);
|
| 1041 |
+
}
|
| 1042 |
+
|
| 1043 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf) {
|
| 1044 |
+
if (satf)
|
| 1045 |
+
__imma_m8n8k32_mma_u4((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 1);
|
| 1046 |
+
else
|
| 1047 |
+
__imma_m8n8k32_mma_u4((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 0);
|
| 1048 |
+
}
|
| 1049 |
+
|
| 1050 |
+
//
|
| 1051 |
+
// MMA functions for shape m8n8k128
|
| 1052 |
+
//
|
| 1053 |
+
__CUDA_MMA_DEVICE_DECL__ void bmma_sync(fragment<accumulator, 8, 8, 128, int>& d, const fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& b, const fragment<accumulator, 8, 8, 128, int>& c,
|
| 1054 |
+
experimental::bmmaBitOp op, experimental::bmmaAccumulateOp)
|
| 1055 |
+
{
|
| 1056 |
+
|
| 1057 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 1058 |
+
if (op == experimental::bmmaBitOpAND)
|
| 1059 |
+
__bmma_m8n8k128_mma_and_popc_b1((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1);
|
| 1060 |
+
else
|
| 1061 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 1062 |
+
__bmma_m8n8k128_mma_xor_popc_b1((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1);
|
| 1063 |
+
}
|
| 1064 |
+
|
| 1065 |
+
|
| 1066 |
+
#endif /* __CUDA_SUBBYTE_IMMA__ */
|
| 1067 |
+
|
| 1068 |
+
#ifdef __CUDA_AMPERE_MMA__
|
| 1069 |
+
//
|
| 1070 |
+
// MMA functions for shape m16n16k8
|
| 1071 |
+
//
|
| 1072 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) {
|
| 1073 |
+
__mma_tf32_m16n16k8_mma_f32((float *)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
|
| 1074 |
+
}
|
| 1075 |
+
|
| 1076 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) {
|
| 1077 |
+
__mma_tf32_m16n16k8_mma_f32((float *)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
|
| 1078 |
+
}
|
| 1079 |
+
|
| 1080 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) {
|
| 1081 |
+
__mma_tf32_m16n16k8_mma_f32((float *)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
|
| 1082 |
+
}
|
| 1083 |
+
|
| 1084 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) {
|
| 1085 |
+
__mma_tf32_m16n16k8_mma_f32((float *)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
|
| 1086 |
+
}
|
| 1087 |
+
|
| 1088 |
+
|
| 1089 |
+
//
|
| 1090 |
+
// MMA functions for shape m8n8k4
|
| 1091 |
+
//
|
| 1092 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) {
|
| 1093 |
+
__dmma_m8n8k4_mma_f64((double *)&d, (const double*)&a, (const double*)&b, (const double*)&c, 1, 0);
|
| 1094 |
+
}
|
| 1095 |
+
|
| 1096 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) {
|
| 1097 |
+
__dmma_m8n8k4_mma_f64((double *)&d, (const double*)&a, (const double*)&b, (const double*)&c, 3, 0);
|
| 1098 |
+
}
|
| 1099 |
+
|
| 1100 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) {
|
| 1101 |
+
__dmma_m8n8k4_mma_f64((double *)&d, (const double*)&a, (const double*)&b, (const double*)&c, 0, 0);
|
| 1102 |
+
}
|
| 1103 |
+
|
| 1104 |
+
__CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) {
|
| 1105 |
+
__dmma_m8n8k4_mma_f64((double *)&d, (const double*)&a, (const double*)&b, (const double*)&c, 2, 0);
|
| 1106 |
+
}
|
| 1107 |
+
|
| 1108 |
+
#endif /* __CUDA_AMPERE_MMA__ */
|
| 1109 |
+
|
| 1110 |
+
};
|
| 1111 |
+
};
|
| 1112 |
+
|
| 1113 |
+
#undef __CUDA_IMMA__
|
| 1114 |
+
#undef __CUDA_SUBBYTE_IMMA__
|
| 1115 |
+
#undef __CUDA_MMA_DEVICE_DECL__
|
| 1116 |
+
#undef __CUDA_AMPERE_MMA__
|
| 1117 |
+
|
| 1118 |
+
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
|
| 1119 |
+
|
| 1120 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 1121 |
+
|
| 1122 |
+
|
| 1123 |
+
#endif /* __CUDA_MMA_HPP__ */
|
| 1124 |
+
|
| 1125 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_HPP__)
|
| 1126 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 1127 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_HPP__
|
| 1128 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/nvfunctional
ADDED
|
@@ -0,0 +1,621 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* NVIDIA_COPYRIGHT_BEGIN
|
| 3 |
+
*
|
| 4 |
+
* Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
|
| 5 |
+
*
|
| 6 |
+
* NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 7 |
+
* and proprietary rights in and to this software, related documentation
|
| 8 |
+
* and any modifications thereto. Any use, reproduction, disclosure or
|
| 9 |
+
* distribution of this software and related documentation without an express
|
| 10 |
+
* license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 11 |
+
*
|
| 12 |
+
* NVIDIA_COPYRIGHT_END
|
| 13 |
+
*/
|
| 14 |
+
|
| 15 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 16 |
+
#if defined(_MSC_VER)
|
| 17 |
+
#pragma message("crt/nvfunctional is an internal header file and must not be used directly. Please use nvfunctional instead.")
|
| 18 |
+
#else
|
| 19 |
+
#warning "crt/nvfunctional is an internal header file and must not be used directly. Please use nvfunctional instead."
|
| 20 |
+
#endif
|
| 21 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 22 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__
|
| 23 |
+
#endif
|
| 24 |
+
|
| 25 |
+
#ifndef __NV_LIBCXX_FUNCTIONAL_H__
|
| 26 |
+
#define __NV_LIBCXX_FUNCTIONAL_H__
|
| 27 |
+
|
| 28 |
+
#if __cplusplus < 201103L
|
| 29 |
+
#if defined(_MSC_VER)
|
| 30 |
+
#if _MSC_VER < 1800
|
| 31 |
+
#error This library requires VS 2013 and above
|
| 32 |
+
#endif /* _MSC_VER < 1800 */
|
| 33 |
+
#else /* !_MSC_VER */
|
| 34 |
+
#error This library requires support for the ISO C++ 2011 standard
|
| 35 |
+
#endif /* _MSC_VER */
|
| 36 |
+
#endif /* __cplusplus */
|
| 37 |
+
|
| 38 |
+
#if defined(_MSC_VER)
|
| 39 |
+
#define __NV_ALIGNOF __alignof
|
| 40 |
+
#define __NV_NOEXCEPT
|
| 41 |
+
#define __NV_CONSTEXPR
|
| 42 |
+
#else /* !_MSC_VER */
|
| 43 |
+
#define __NV_ALIGNOF alignof
|
| 44 |
+
#define __NV_NOEXCEPT noexcept
|
| 45 |
+
#define __NV_CONSTEXPR constexpr
|
| 46 |
+
#endif /* _MSC_VER */
|
| 47 |
+
|
| 48 |
+
#include <type_traits>
|
| 49 |
+
#include <cstddef>
|
| 50 |
+
#include <new>
|
| 51 |
+
|
| 52 |
+
// n3290 20.8
|
| 53 |
+
namespace nvstd
|
| 54 |
+
{
|
| 55 |
+
|
| 56 |
+
namespace internal {
|
| 57 |
+
|
| 58 |
+
// D.8.1 base (deprecated) [depr.base]
|
| 59 |
+
template <class _Arg, class _Result>
|
| 60 |
+
struct unary_function
|
| 61 |
+
{
|
| 62 |
+
typedef _Arg argument_type;
|
| 63 |
+
typedef _Result result_type;
|
| 64 |
+
};
|
| 65 |
+
|
| 66 |
+
template <class _Arg1, class _Arg2, class _Result>
|
| 67 |
+
struct binary_function
|
| 68 |
+
{
|
| 69 |
+
typedef _Arg1 first_argument_type;
|
| 70 |
+
typedef _Arg2 second_argument_type;
|
| 71 |
+
typedef _Result result_type;
|
| 72 |
+
};
|
| 73 |
+
|
| 74 |
+
// move
|
| 75 |
+
template <class _T>
|
| 76 |
+
inline __device__ __host__
|
| 77 |
+
typename std::remove_reference<_T>::type&& move(_T&& __t) __NV_NOEXCEPT
|
| 78 |
+
{
|
| 79 |
+
return static_cast<typename std::remove_reference<_T>::type&&>(__t);
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
// 20.2.2 swap [utility.swap]
|
| 83 |
+
// swap
|
| 84 |
+
template<class _T,
|
| 85 |
+
class = typename std::enable_if<
|
| 86 |
+
std::is_move_constructible<_T>::value &&
|
| 87 |
+
std::is_move_assignable<_T>::value>::type>
|
| 88 |
+
inline __device__ __host__
|
| 89 |
+
void swap(_T& __a, _T& __b)
|
| 90 |
+
#if !defined(_MSC_VER)
|
| 91 |
+
noexcept(std::is_nothrow_move_constructible<_T>::value &&
|
| 92 |
+
std::is_nothrow_move_assignable<_T>::value)
|
| 93 |
+
#endif /* !defined(_MSC_VER) */
|
| 94 |
+
{
|
| 95 |
+
_T __t(internal::move(__a));
|
| 96 |
+
__a = internal::move(__b);
|
| 97 |
+
__b = internal::move(__t);
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
// 20.2.3 forward/move helpers [forward]
|
| 101 |
+
// forward
|
| 102 |
+
template <class _T>
|
| 103 |
+
inline __device__ __host__
|
| 104 |
+
_T&& forward(typename std::remove_reference<_T>::type& __t) __NV_NOEXCEPT
|
| 105 |
+
{
|
| 106 |
+
return static_cast<_T&&>(__t);
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
template <class _T>
|
| 110 |
+
inline __device__ __host__
|
| 111 |
+
_T&& forward(typename std::remove_reference<_T>::type&& __t) __NV_NOEXCEPT
|
| 112 |
+
{
|
| 113 |
+
static_assert(!std::is_lvalue_reference<_T>::value,
|
| 114 |
+
"Error: __t is instantiated with an lvalue reference type");
|
| 115 |
+
return static_cast<_T&&>(__t);
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
} // namespace internal
|
| 119 |
+
|
| 120 |
+
namespace __functional_helpers
|
| 121 |
+
{
|
| 122 |
+
|
| 123 |
+
struct __dummy_class;
|
| 124 |
+
|
| 125 |
+
// Store small functors locally:
|
| 126 |
+
// a functor is legitimate to local storage if it is one of the following types:
|
| 127 |
+
// * member object pointer;
|
| 128 |
+
// * member function pointer;
|
| 129 |
+
// * closure type of size less than or equal to the largest size of
|
| 130 |
+
// the above types;
|
| 131 |
+
// * function pointer;
|
| 132 |
+
// * any callable class whose size is less than or equal to
|
| 133 |
+
// the largest one of the above types;
|
| 134 |
+
union _Small_functor_types
|
| 135 |
+
{
|
| 136 |
+
void *__obj;
|
| 137 |
+
void (*__func_ptr)();
|
| 138 |
+
void (__dummy_class::*mem_fn_ptr)();
|
| 139 |
+
};
|
| 140 |
+
|
| 141 |
+
struct _Small_functor_data {
|
| 142 |
+
char __data[sizeof(_Small_functor_types)];
|
| 143 |
+
};
|
| 144 |
+
|
| 145 |
+
template <class _RetType, class ..._ArgTypes>
|
| 146 |
+
struct __maybe_base_function
|
| 147 |
+
{ };
|
| 148 |
+
|
| 149 |
+
template <class _RetType, class _T1>
|
| 150 |
+
struct __maybe_base_function<_RetType(_T1)>
|
| 151 |
+
: public internal::unary_function<_T1, _RetType>
|
| 152 |
+
{ };
|
| 153 |
+
|
| 154 |
+
template <class _RetType, class _T1, class _T2>
|
| 155 |
+
struct __maybe_base_function<_RetType(_T1, _T2)>
|
| 156 |
+
: public internal::binary_function<_T1, _T2, _RetType>
|
| 157 |
+
{ };
|
| 158 |
+
|
| 159 |
+
} // namespace __functional_helpers
|
| 160 |
+
|
| 161 |
+
// 20.8.11 Polymorphic function wrappers [func.wrap]
|
| 162 |
+
|
| 163 |
+
// 20.8.11.1 Class bad_function_call [func.wrap.badcall]
|
| 164 |
+
// unimplemented because of exception
|
| 165 |
+
// class bad_function_call : public std::exception
|
| 166 |
+
|
| 167 |
+
// 20.8.11.2 Class template function [func.wrap.func]
|
| 168 |
+
|
| 169 |
+
template<class> class function; // undefined
|
| 170 |
+
|
| 171 |
+
// Simplified version of template class function, which
|
| 172 |
+
// * does not support allocator_arg_t;
|
| 173 |
+
// * does not support target and target_type that rely on RTTI
|
| 174 |
+
// * does not throw bad_function_call exception on invoking a NULL target
|
| 175 |
+
template <class _RetType, class ..._ArgTypes>
|
| 176 |
+
class function<_RetType(_ArgTypes...)>
|
| 177 |
+
: public __functional_helpers::__maybe_base_function<_RetType(_ArgTypes...)>
|
| 178 |
+
{
|
| 179 |
+
__functional_helpers::_Small_functor_data __small_functor_data;
|
| 180 |
+
void *__obj;
|
| 181 |
+
typedef _RetType(*__meta_fn_type)(void *, _ArgTypes...);
|
| 182 |
+
__meta_fn_type __meta_fn;
|
| 183 |
+
typedef void(*__cloner_type)(function &, const function &);
|
| 184 |
+
__cloner_type __cloner;
|
| 185 |
+
typedef void(*__destructor_type)(function *);
|
| 186 |
+
__destructor_type __destructor;
|
| 187 |
+
|
| 188 |
+
#pragma nv_exec_check_disable
|
| 189 |
+
template <class _F>
|
| 190 |
+
__device__ __host__
|
| 191 |
+
__NV_CONSTEXPR bool __use_small_functor_data() const
|
| 192 |
+
{
|
| 193 |
+
return (sizeof(_F) <= sizeof(__small_functor_data) &&
|
| 194 |
+
__NV_ALIGNOF(_F) <= __NV_ALIGNOF(
|
| 195 |
+
__functional_helpers::_Small_functor_types));
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
#pragma nv_exec_check_disable
|
| 199 |
+
__device__ __host__
|
| 200 |
+
void* __get_small_functor_data() const
|
| 201 |
+
{
|
| 202 |
+
return (void*)(&__small_functor_data.__data[0]);
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
#pragma nv_exec_check_disable
|
| 206 |
+
__device__ __host__
|
| 207 |
+
bool __is_small_functor_data() const
|
| 208 |
+
{
|
| 209 |
+
return __obj == __get_small_functor_data();
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
#pragma nv_exec_check_disable
|
| 213 |
+
template <class _F>
|
| 214 |
+
__device__ __host__
|
| 215 |
+
static _F& __get_functor(void *__p)
|
| 216 |
+
{
|
| 217 |
+
return *((_F*)__p);
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
#pragma nv_exec_check_disable
|
| 221 |
+
template <class _F>
|
| 222 |
+
__device__ __host__
|
| 223 |
+
static bool __is_empty_functor(const _F& /*__p*/)
|
| 224 |
+
{
|
| 225 |
+
return false;
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
#pragma nv_exec_check_disable
|
| 229 |
+
template <class _F>
|
| 230 |
+
__device__ __host__
|
| 231 |
+
static bool __is_empty_functor(const _F* __p)
|
| 232 |
+
{
|
| 233 |
+
return !__p;
|
| 234 |
+
}
|
| 235 |
+
|
| 236 |
+
#pragma nv_exec_check_disable
|
| 237 |
+
template <class _Res, class _C>
|
| 238 |
+
__device__ __host__
|
| 239 |
+
static bool __is_empty_functor(const _Res _C::* __p)
|
| 240 |
+
{
|
| 241 |
+
return !__p;
|
| 242 |
+
}
|
| 243 |
+
|
| 244 |
+
#pragma nv_exec_check_disable
|
| 245 |
+
template <class _Res, class... _Args>
|
| 246 |
+
__device__ __host__
|
| 247 |
+
static bool __is_empty_functor(const function<_Res(_Args...)>& __p)
|
| 248 |
+
{
|
| 249 |
+
return !__p;
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
template <class _F>
|
| 253 |
+
struct __make_cloner
|
| 254 |
+
{
|
| 255 |
+
#pragma nv_exec_check_disable
|
| 256 |
+
__device__ __host__
|
| 257 |
+
static void __clone_data(function &__dest, const function &__src)
|
| 258 |
+
{
|
| 259 |
+
if (__dest.__use_small_functor_data<_F>()) {
|
| 260 |
+
__dest.__obj = __dest.__get_small_functor_data();
|
| 261 |
+
new (__dest.__obj) _F(__src.__get_functor<_F>(__src.__obj));
|
| 262 |
+
}
|
| 263 |
+
else {
|
| 264 |
+
__dest.__obj = new _F(__src.__get_functor<_F>(__src.__obj));
|
| 265 |
+
}
|
| 266 |
+
}
|
| 267 |
+
};
|
| 268 |
+
|
| 269 |
+
template <class _F>
|
| 270 |
+
struct __make_destructor
|
| 271 |
+
{
|
| 272 |
+
#pragma nv_exec_check_disable
|
| 273 |
+
__device__ __host__
|
| 274 |
+
static void __destruct(function *__fn)
|
| 275 |
+
{
|
| 276 |
+
if (__fn->__use_small_functor_data<_F>()) {
|
| 277 |
+
(__fn->__get_functor<_F>(__fn->__obj)).~_F();
|
| 278 |
+
}
|
| 279 |
+
else {
|
| 280 |
+
delete (_F*)(__fn->__obj);
|
| 281 |
+
}
|
| 282 |
+
}
|
| 283 |
+
};
|
| 284 |
+
|
| 285 |
+
// We cannot simple define __make_functor in the following way:
|
| 286 |
+
// template <class _T, _F>
|
| 287 |
+
// __make_functor;
|
| 288 |
+
// template <class _RetType1, class _F, class... _ArgTypes1>
|
| 289 |
+
// struct __make_functor<_RetType1(_ArgTypes1...), _F>
|
| 290 |
+
//
|
| 291 |
+
// because VS 2013 cannot unpack _RetType1(_ArgTypes1...)
|
| 292 |
+
template <class _RetType1, class _F, class... _ArgTypes1>
|
| 293 |
+
struct __make_functor
|
| 294 |
+
{
|
| 295 |
+
typedef _RetType1 type;
|
| 296 |
+
|
| 297 |
+
#pragma nv_exec_check_disable
|
| 298 |
+
__device__ __host__
|
| 299 |
+
static _RetType1 __invoke(void *__d, _ArgTypes1... __args)
|
| 300 |
+
{
|
| 301 |
+
return __get_functor<_F>(__d)(
|
| 302 |
+
internal::forward<_ArgTypes1>(__args)...);
|
| 303 |
+
}
|
| 304 |
+
};
|
| 305 |
+
|
| 306 |
+
template <class _RetType1, class _C, class _M, class... _ArgTypes1>
|
| 307 |
+
struct __make_functor<_RetType1, _M _C::*,_ArgTypes1...>
|
| 308 |
+
{
|
| 309 |
+
typedef _RetType1 type;
|
| 310 |
+
typedef _RetType1(*_Fn)(_ArgTypes1...);
|
| 311 |
+
|
| 312 |
+
#pragma nv_exec_check_disable
|
| 313 |
+
__device__ __host__
|
| 314 |
+
static _RetType1 __invoke(void *__d, _ArgTypes1... __args)
|
| 315 |
+
{
|
| 316 |
+
return __get_functor<_Fn>(__d)(
|
| 317 |
+
internal::forward<_ArgTypes1>(__args)...);
|
| 318 |
+
}
|
| 319 |
+
};
|
| 320 |
+
|
| 321 |
+
// workaround for GCC version below 4.8
|
| 322 |
+
#if (__GNUC__ == 4) && (__GNUC_MINOR__ < 8)
|
| 323 |
+
template <class _F>
|
| 324 |
+
struct __check_callability
|
| 325 |
+
: public std::integral_constant<bool,
|
| 326 |
+
!std::is_same<_F, std::nullptr_t>::value>
|
| 327 |
+
{ };
|
| 328 |
+
#elif defined(_MSC_VER)
|
| 329 |
+
// simulate VC 2013's behavior...
|
| 330 |
+
template <class _F>
|
| 331 |
+
struct __check_callability1
|
| 332 |
+
: public
|
| 333 |
+
std::integral_constant<bool,
|
| 334 |
+
// std::result_of does not handle member pointers well
|
| 335 |
+
std::is_member_pointer<_F>::value ||
|
| 336 |
+
std::is_convertible<
|
| 337 |
+
_RetType,
|
| 338 |
+
typename std::result_of<_F(_ArgTypes...)>::type
|
| 339 |
+
>::value
|
| 340 |
+
>
|
| 341 |
+
{ };
|
| 342 |
+
|
| 343 |
+
template <class _F>
|
| 344 |
+
struct __check_callability
|
| 345 |
+
: public std::integral_constant<
|
| 346 |
+
bool,
|
| 347 |
+
!std::is_same<_F, function>::value &&
|
| 348 |
+
__check_callability1<typename std::remove_cv<_F>::type>::value>
|
| 349 |
+
{ };
|
| 350 |
+
#else /* !((__GNUC__ == 4) && (__GNUC_MINOR__ < 8)) _MSC_VER */
|
| 351 |
+
template <class _F,
|
| 352 |
+
class _T = typename std::result_of<_F(_ArgTypes...)>::type>
|
| 353 |
+
struct __check_callability
|
| 354 |
+
: public std::integral_constant<
|
| 355 |
+
bool,
|
| 356 |
+
!std::is_same<_F, function>::value &&
|
| 357 |
+
std::is_convertible< _T, _RetType>::value>
|
| 358 |
+
{ };
|
| 359 |
+
#endif /* __GNUC__ == 4) && (__GNUC_MINOR__ < 8) */
|
| 360 |
+
|
| 361 |
+
#pragma nv_exec_check_disable
|
| 362 |
+
__device__ __host__
|
| 363 |
+
void __destroy()
|
| 364 |
+
{
|
| 365 |
+
if (__obj) {
|
| 366 |
+
__destructor(this);
|
| 367 |
+
__obj = 0;
|
| 368 |
+
}
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
+
#pragma nv_exec_check_disable
|
| 372 |
+
__device__ __host__
|
| 373 |
+
void __clear()
|
| 374 |
+
{
|
| 375 |
+
__obj = 0;
|
| 376 |
+
__meta_fn = 0;
|
| 377 |
+
__cloner = 0;
|
| 378 |
+
__destructor = 0;
|
| 379 |
+
}
|
| 380 |
+
|
| 381 |
+
public:
|
| 382 |
+
typedef _RetType result_type;
|
| 383 |
+
|
| 384 |
+
/*
|
| 385 |
+
* These typedef(s) are derived from __maybe_base_function
|
| 386 |
+
* typedef T1 argument_type; // only if sizeof...(ArgTypes) == 1 and
|
| 387 |
+
* // the type in ArgTypes is T1
|
| 388 |
+
* typedef T1 first_argument_type; // only if sizeof...(ArgTypes) == 2 and
|
| 389 |
+
* // ArgTypes contains T1 and T2
|
| 390 |
+
* typedef T2 second_argument_type; // only if sizeof...(ArgTypes) == 2 and
|
| 391 |
+
* // ArgTypes contains T1 and T2
|
| 392 |
+
*/
|
| 393 |
+
|
| 394 |
+
// 20.8.11.2.1 construct/copy/destroy [func.wrap.con]
|
| 395 |
+
|
| 396 |
+
#pragma nv_exec_check_disable
|
| 397 |
+
__device__ __host__
|
| 398 |
+
function() __NV_NOEXCEPT
|
| 399 |
+
: __obj(0), __meta_fn(0), __cloner(0), __destructor(0) {}
|
| 400 |
+
|
| 401 |
+
#pragma nv_exec_check_disable
|
| 402 |
+
__device__ __host__
|
| 403 |
+
function(std::nullptr_t) __NV_NOEXCEPT
|
| 404 |
+
: __obj(0), __meta_fn(0), __cloner(0), __destructor(0) {}
|
| 405 |
+
|
| 406 |
+
#pragma nv_exec_check_disable
|
| 407 |
+
__device__ __host__
|
| 408 |
+
function(const function &__fn)
|
| 409 |
+
{
|
| 410 |
+
if (__fn.__obj == 0) {
|
| 411 |
+
__clear();
|
| 412 |
+
}
|
| 413 |
+
else {
|
| 414 |
+
__meta_fn = __fn.__meta_fn;
|
| 415 |
+
__destructor = __fn.__destructor;
|
| 416 |
+
__fn.__cloner(*this, __fn);
|
| 417 |
+
__cloner = __fn.__cloner;
|
| 418 |
+
}
|
| 419 |
+
}
|
| 420 |
+
|
| 421 |
+
#pragma nv_exec_check_disable
|
| 422 |
+
__device__ __host__
|
| 423 |
+
function(function &&__fn)
|
| 424 |
+
{
|
| 425 |
+
__fn.swap(*this);
|
| 426 |
+
}
|
| 427 |
+
|
| 428 |
+
// VS 2013 cannot process __check_callability type trait.
|
| 429 |
+
// So, we check callability using static_assert instead of
|
| 430 |
+
// using SFINAE such as
|
| 431 |
+
// template<class _F,
|
| 432 |
+
// class = typename std::enable_if<
|
| 433 |
+
// __check_callability<_F>::value
|
| 434 |
+
// >::type>
|
| 435 |
+
|
| 436 |
+
#pragma nv_exec_check_disable
|
| 437 |
+
template<class _F>
|
| 438 |
+
__device__ __host__
|
| 439 |
+
function(_F);
|
| 440 |
+
|
| 441 |
+
// copy and swap
|
| 442 |
+
#pragma nv_exec_check_disable
|
| 443 |
+
__device__ __host__
|
| 444 |
+
function& operator=(const function& __fn)
|
| 445 |
+
{
|
| 446 |
+
function(__fn).swap(*this);
|
| 447 |
+
return *this;
|
| 448 |
+
}
|
| 449 |
+
|
| 450 |
+
#pragma nv_exec_check_disable
|
| 451 |
+
__device__ __host__
|
| 452 |
+
function& operator=(function&& __fn)
|
| 453 |
+
{
|
| 454 |
+
function(internal::move(__fn)).swap(*this);
|
| 455 |
+
return *this;
|
| 456 |
+
}
|
| 457 |
+
|
| 458 |
+
#pragma nv_exec_check_disable
|
| 459 |
+
__device__ __host__
|
| 460 |
+
function& operator=(std::nullptr_t)
|
| 461 |
+
{
|
| 462 |
+
__destroy();
|
| 463 |
+
return *this;
|
| 464 |
+
}
|
| 465 |
+
|
| 466 |
+
#pragma nv_exec_check_disable
|
| 467 |
+
template<class _F>
|
| 468 |
+
__device__ __host__
|
| 469 |
+
function&
|
| 470 |
+
operator=(_F&& __fn)
|
| 471 |
+
{
|
| 472 |
+
static_assert(__check_callability<_F>::value,
|
| 473 |
+
"Unable to create functor object!");
|
| 474 |
+
function(internal::forward<_F>(__fn)).swap(*this);
|
| 475 |
+
return *this;
|
| 476 |
+
}
|
| 477 |
+
|
| 478 |
+
#pragma nv_exec_check_disable
|
| 479 |
+
__device__ __host__
|
| 480 |
+
~function()
|
| 481 |
+
{
|
| 482 |
+
__destroy();
|
| 483 |
+
}
|
| 484 |
+
|
| 485 |
+
// 20.8.11.2.2 function modifiers [func.wrap.func.mod]
|
| 486 |
+
#pragma nv_exec_check_disable
|
| 487 |
+
__device__ __host__
|
| 488 |
+
void swap(function& __fn) __NV_NOEXCEPT
|
| 489 |
+
{
|
| 490 |
+
internal::swap(__meta_fn, __fn.__meta_fn);
|
| 491 |
+
internal::swap(__cloner, __fn.__cloner);
|
| 492 |
+
internal::swap(__destructor, __fn.__destructor);
|
| 493 |
+
|
| 494 |
+
if (__is_small_functor_data() && __fn.__is_small_functor_data()) {
|
| 495 |
+
internal::swap(__small_functor_data, __fn.__small_functor_data);
|
| 496 |
+
}
|
| 497 |
+
else if (__is_small_functor_data()) {
|
| 498 |
+
internal::swap(__small_functor_data, __fn.__small_functor_data);
|
| 499 |
+
internal::swap(__obj, __fn.__obj);
|
| 500 |
+
__fn.__obj = __fn.__get_small_functor_data();
|
| 501 |
+
}
|
| 502 |
+
else if (__fn.__is_small_functor_data()) {
|
| 503 |
+
internal::swap(__small_functor_data, __fn.__small_functor_data);
|
| 504 |
+
internal::swap(__obj, __fn.__obj);
|
| 505 |
+
__obj = __get_small_functor_data();
|
| 506 |
+
}
|
| 507 |
+
else {
|
| 508 |
+
internal::swap(__obj, __fn.__obj);
|
| 509 |
+
}
|
| 510 |
+
}
|
| 511 |
+
|
| 512 |
+
// 20.8.11.2.3 function capacity [func.wrap.func.cap]
|
| 513 |
+
#pragma nv_exec_check_disable
|
| 514 |
+
__device__ __host__
|
| 515 |
+
explicit operator bool() const __NV_NOEXCEPT
|
| 516 |
+
{
|
| 517 |
+
return __obj;
|
| 518 |
+
}
|
| 519 |
+
|
| 520 |
+
// 20.8.11.2.4 function invocation [func.wrap.func.inv]
|
| 521 |
+
// function::operator() can only be called in device code
|
| 522 |
+
// to avoid cross-execution space calls
|
| 523 |
+
#pragma nv_exec_check_disable
|
| 524 |
+
__device__ __host__
|
| 525 |
+
_RetType operator()(_ArgTypes...) const;
|
| 526 |
+
|
| 527 |
+
};
|
| 528 |
+
|
| 529 |
+
// Out-of-line definitions
|
| 530 |
+
#pragma nv_exec_check_disable
|
| 531 |
+
template<class _RetType, class... _ArgTypes>
|
| 532 |
+
template<class _F>
|
| 533 |
+
__device__ __host__
|
| 534 |
+
function<_RetType(_ArgTypes...)>::function(_F __fn)
|
| 535 |
+
: __obj(0), __meta_fn(0), __cloner(0), __destructor(0)
|
| 536 |
+
{
|
| 537 |
+
static_assert(__check_callability<_F>::value,
|
| 538 |
+
"Unable to construct functor object!");
|
| 539 |
+
if (__is_empty_functor(__fn))
|
| 540 |
+
return;
|
| 541 |
+
__meta_fn = &__make_functor<_RetType, _F, _ArgTypes...>::__invoke;
|
| 542 |
+
__cloner = &__make_cloner<_F>::__clone_data;
|
| 543 |
+
__destructor = &__make_destructor<_F>::__destruct;
|
| 544 |
+
|
| 545 |
+
if (__use_small_functor_data<_F>()) {
|
| 546 |
+
__obj = __get_small_functor_data();
|
| 547 |
+
new ((void*)__obj) _F(internal::move(__fn));
|
| 548 |
+
}
|
| 549 |
+
else {
|
| 550 |
+
__obj = new _F(internal::move(__fn));
|
| 551 |
+
}
|
| 552 |
+
}
|
| 553 |
+
|
| 554 |
+
#pragma nv_exec_check_disable
|
| 555 |
+
template <class _RetType, class..._ArgTypes>
|
| 556 |
+
__device__ __host__
|
| 557 |
+
_RetType
|
| 558 |
+
function<_RetType(_ArgTypes...)>::operator()(_ArgTypes... __args) const
|
| 559 |
+
{
|
| 560 |
+
return __meta_fn(__obj, internal::forward<_ArgTypes>(__args)...);
|
| 561 |
+
}
|
| 562 |
+
|
| 563 |
+
// 20.8.11.2.6, Null pointer comparisons:
|
| 564 |
+
|
| 565 |
+
#pragma nv_exec_check_disable
|
| 566 |
+
template <class _R, class... _ArgTypes>
|
| 567 |
+
__device__ __host__
|
| 568 |
+
bool operator==(const function<_R(_ArgTypes...)>& __fn, std::nullptr_t)
|
| 569 |
+
__NV_NOEXCEPT
|
| 570 |
+
{
|
| 571 |
+
return !__fn;
|
| 572 |
+
}
|
| 573 |
+
|
| 574 |
+
#pragma nv_exec_check_disable
|
| 575 |
+
template <class _R, class... _ArgTypes>
|
| 576 |
+
__device__ __host__
|
| 577 |
+
bool operator==(std::nullptr_t, const function<_R(_ArgTypes...)>& __fn)
|
| 578 |
+
__NV_NOEXCEPT
|
| 579 |
+
{
|
| 580 |
+
return !__fn;
|
| 581 |
+
}
|
| 582 |
+
|
| 583 |
+
#pragma nv_exec_check_disable
|
| 584 |
+
template <class _R, class... _ArgTypes>
|
| 585 |
+
__device__ __host__
|
| 586 |
+
bool operator!=(const function<_R(_ArgTypes...)>& __fn, std::nullptr_t)
|
| 587 |
+
__NV_NOEXCEPT
|
| 588 |
+
{
|
| 589 |
+
return static_cast<bool>(__fn);
|
| 590 |
+
}
|
| 591 |
+
|
| 592 |
+
#pragma nv_exec_check_disable
|
| 593 |
+
template <class _R, class... _ArgTypes>
|
| 594 |
+
__device__ __host__
|
| 595 |
+
bool operator!=(std::nullptr_t, const function<_R(_ArgTypes...)>& __fn)
|
| 596 |
+
__NV_NOEXCEPT
|
| 597 |
+
{
|
| 598 |
+
return static_cast<bool>(__fn);
|
| 599 |
+
}
|
| 600 |
+
|
| 601 |
+
// 20.8.11.2.7, specialized algorithms:
|
| 602 |
+
#pragma nv_exec_check_disable
|
| 603 |
+
template <class _R, class... _ArgTypes>
|
| 604 |
+
__device__ __host__
|
| 605 |
+
void swap(function<_R(_ArgTypes...)>& __fn1, function<_R(_ArgTypes...)>& __fn2)
|
| 606 |
+
{
|
| 607 |
+
__fn1.swap(__fn2);
|
| 608 |
+
}
|
| 609 |
+
|
| 610 |
+
} // namespace nvstd
|
| 611 |
+
|
| 612 |
+
#undef __NV_NOEXCEPT
|
| 613 |
+
#undef __NV_CONSTEXPR
|
| 614 |
+
#undef __NV_ALIGNOF
|
| 615 |
+
|
| 616 |
+
#endif // __NV_LIBCXX_FUNCTIONAL_H__
|
| 617 |
+
|
| 618 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__)
|
| 619 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 620 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__
|
| 621 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.h
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2017-2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
|
| 51 |
+
#define EXCLUDE_FROM_RTC
|
| 52 |
+
|
| 53 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 54 |
+
#if defined(_MSC_VER)
|
| 55 |
+
#pragma message("crt/sm_70_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 56 |
+
#else
|
| 57 |
+
#warning "crt/sm_70_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 58 |
+
#endif
|
| 59 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 60 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__
|
| 61 |
+
#endif
|
| 62 |
+
|
| 63 |
+
#if !defined(__SM_70_RT_H__)
|
| 64 |
+
#define __SM_70_RT_H__
|
| 65 |
+
|
| 66 |
+
#if defined(__CUDACC_RTC__)
|
| 67 |
+
#define __SM_70_RT_DECL__ __host__ __device__
|
| 68 |
+
#elif defined(_NVHPC_CUDA)
|
| 69 |
+
#define __SM_70_RT_DECL__ extern __device__ __cudart_builtin__
|
| 70 |
+
#else /* !__CUDACC_RTC__ */
|
| 71 |
+
#define __SM_70_RT_DECL__ static __device__ __inline__
|
| 72 |
+
#endif /* __CUDACC_RTC__ */
|
| 73 |
+
|
| 74 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 75 |
+
|
| 76 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
|
| 77 |
+
|
| 78 |
+
/*******************************************************************************
|
| 79 |
+
* *
|
| 80 |
+
* *
|
| 81 |
+
* *
|
| 82 |
+
*******************************************************************************/
|
| 83 |
+
|
| 84 |
+
#include "builtin_types.h"
|
| 85 |
+
#include "device_types.h"
|
| 86 |
+
#include "host_defines.h"
|
| 87 |
+
|
| 88 |
+
#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
|
| 89 |
+
#define __DEF_IF_HOST { }
|
| 90 |
+
#else /* !__CUDA_ARCH__ */
|
| 91 |
+
#define __DEF_IF_HOST ;
|
| 92 |
+
#endif /* __CUDA_ARCH__ */
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
/******************************************************************************
|
| 96 |
+
* match *
|
| 97 |
+
******************************************************************************/
|
| 98 |
+
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned value) __DEF_IF_HOST
|
| 99 |
+
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, int value) __DEF_IF_HOST
|
| 100 |
+
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long value) __DEF_IF_HOST
|
| 101 |
+
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long value) __DEF_IF_HOST
|
| 102 |
+
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long long value) __DEF_IF_HOST
|
| 103 |
+
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long long value) __DEF_IF_HOST
|
| 104 |
+
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, float value) __DEF_IF_HOST
|
| 105 |
+
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, double value) __DEF_IF_HOST
|
| 106 |
+
|
| 107 |
+
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned value, int *pred) __DEF_IF_HOST
|
| 108 |
+
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, int value, int *pred) __DEF_IF_HOST
|
| 109 |
+
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long value, int *pred) __DEF_IF_HOST
|
| 110 |
+
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long value, int *pred) __DEF_IF_HOST
|
| 111 |
+
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long long value, int *pred) __DEF_IF_HOST
|
| 112 |
+
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long long value, int *pred) __DEF_IF_HOST
|
| 113 |
+
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, float value, int *pred) __DEF_IF_HOST
|
| 114 |
+
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, double value, int *pred) __DEF_IF_HOST
|
| 115 |
+
|
| 116 |
+
__SM_70_RT_DECL__ void __nanosleep(unsigned int ns) __DEF_IF_HOST
|
| 117 |
+
|
| 118 |
+
__SM_70_RT_DECL__ unsigned short int atomicCAS(unsigned short int *address, unsigned short int compare, unsigned short int val) __DEF_IF_HOST
|
| 119 |
+
|
| 120 |
+
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
|
| 121 |
+
|
| 122 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 123 |
+
|
| 124 |
+
#undef __DEF_IF_HOST
|
| 125 |
+
#undef __SM_70_RT_DECL__
|
| 126 |
+
|
| 127 |
+
#if (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA)
|
| 128 |
+
#include "sm_70_rt.hpp"
|
| 129 |
+
#endif /* (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA) */
|
| 130 |
+
|
| 131 |
+
#endif /* !__SM_70_RT_H__ */
|
| 132 |
+
|
| 133 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__)
|
| 134 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 135 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__
|
| 136 |
+
#endif
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
#undef EXCLUDE_FROM_RTC
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.hpp
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2017-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("crt/sm_70_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "crt/sm_70_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#if !defined(__SM_70_RT_HPP__)
|
| 61 |
+
#define __SM_70_RT_HPP__
|
| 62 |
+
|
| 63 |
+
#if defined(__CUDACC_RTC__)
|
| 64 |
+
#define __SM_70_RT_DECL__ __host__ __device__
|
| 65 |
+
#else /* !__CUDACC_RTC__ */
|
| 66 |
+
#define __SM_70_RT_DECL__ static __device__ __inline__
|
| 67 |
+
#endif /* __CUDACC_RTC__ */
|
| 68 |
+
|
| 69 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 70 |
+
|
| 71 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
|
| 72 |
+
|
| 73 |
+
/*******************************************************************************
|
| 74 |
+
* *
|
| 75 |
+
* *
|
| 76 |
+
* *
|
| 77 |
+
*******************************************************************************/
|
| 78 |
+
|
| 79 |
+
#include "builtin_types.h"
|
| 80 |
+
#include "device_types.h"
|
| 81 |
+
#include "host_defines.h"
|
| 82 |
+
|
| 83 |
+
/*******************************************************************************
|
| 84 |
+
* *
|
| 85 |
+
* Below are implementations of SM-7.0 builtin functions which are included as *
|
| 86 |
+
* source (instead of being built in to the compiler) *
|
| 87 |
+
* *
|
| 88 |
+
*******************************************************************************/
|
| 89 |
+
|
| 90 |
+
//
|
| 91 |
+
// __match_any_sync
|
| 92 |
+
//
|
| 93 |
+
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned value) {
|
| 94 |
+
return __match32_any_sync(mask, value);
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, int value) {
|
| 98 |
+
return __match32_any_sync(mask, value);
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long value) {
|
| 102 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 103 |
+
__match64_any_sync(mask, (unsigned long long)value):
|
| 104 |
+
__match32_any_sync(mask, (unsigned)value);
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long value) {
|
| 108 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 109 |
+
__match64_any_sync(mask, (unsigned long long)value):
|
| 110 |
+
__match32_any_sync(mask, (unsigned)value);
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long long value) {
|
| 114 |
+
return __match64_any_sync(mask, value);
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long long value) {
|
| 118 |
+
return __match64_any_sync(mask, value);
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, float value) {
|
| 122 |
+
return __match32_any_sync(mask, __float_as_uint(value));
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, double value) {
|
| 126 |
+
return __match64_any_sync(mask, __double_as_longlong(value));
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
//
|
| 130 |
+
// __match_all_sync
|
| 131 |
+
//
|
| 132 |
+
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned value, int *pred) {
|
| 133 |
+
return __match32_all_sync(mask, value, pred);
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, int value, int *pred) {
|
| 137 |
+
return __match32_all_sync(mask, value, pred);
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long value, int *pred) {
|
| 141 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 142 |
+
__match64_all_sync(mask, (unsigned long long)value, pred):
|
| 143 |
+
__match32_all_sync(mask, (unsigned)value, pred);
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long value, int *pred) {
|
| 147 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 148 |
+
__match64_all_sync(mask, (unsigned long long)value, pred):
|
| 149 |
+
__match32_all_sync(mask, (unsigned)value, pred);
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long long value, int *pred) {
|
| 153 |
+
return __match64_all_sync(mask, value, pred);
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long long value, int *pred) {
|
| 157 |
+
return __match64_all_sync(mask, value, pred);
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, float value, int *pred) {
|
| 161 |
+
return __match32_all_sync(mask, __float_as_uint(value), pred);
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, double value, int *pred) {
|
| 165 |
+
return __match64_all_sync(mask, __double_as_longlong(value), pred);
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
__SM_70_RT_DECL__ void __nanosleep(unsigned int ns) {
|
| 169 |
+
asm volatile("nanosleep.u32 %0;" :: "r"(ns));
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
extern "C" __device__ __device_builtin__
|
| 174 |
+
unsigned short __usAtomicCAS(unsigned short *, unsigned short, unsigned short);
|
| 175 |
+
|
| 176 |
+
__SM_70_RT_DECL__ unsigned short int atomicCAS(unsigned short int *address, unsigned short int compare, unsigned short int val) {
|
| 177 |
+
return __usAtomicCAS(address, compare, val);
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
|
| 182 |
+
|
| 183 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 184 |
+
|
| 185 |
+
#undef __SM_70_RT_DECL__
|
| 186 |
+
|
| 187 |
+
#endif /* !__SM_70_RT_HPP__ */
|
| 188 |
+
|
| 189 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__)
|
| 190 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 191 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__
|
| 192 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.h
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2017-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("crt/sm_80_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "crt/sm_80_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#if !defined(__SM_80_RT_H__)
|
| 61 |
+
#define __SM_80_RT_H__
|
| 62 |
+
|
| 63 |
+
#if defined(__CUDACC_RTC__)
|
| 64 |
+
#define __SM_80_RT_DECL__ __host__ __device__
|
| 65 |
+
#elif defined(_NVHPC_CUDA)
|
| 66 |
+
#define __SM_80_RT_DECL__ extern __device__ __cudart_builtin__
|
| 67 |
+
#else /* !__CUDACC_RTC__ */
|
| 68 |
+
#define __SM_80_RT_DECL__ static __device__ __inline__
|
| 69 |
+
#endif /* __CUDACC_RTC__ */
|
| 70 |
+
|
| 71 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 72 |
+
|
| 73 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
|
| 74 |
+
|
| 75 |
+
/*******************************************************************************
|
| 76 |
+
* *
|
| 77 |
+
* *
|
| 78 |
+
* *
|
| 79 |
+
*******************************************************************************/
|
| 80 |
+
|
| 81 |
+
#include "builtin_types.h"
|
| 82 |
+
#include "device_types.h"
|
| 83 |
+
#include "host_defines.h"
|
| 84 |
+
|
| 85 |
+
#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
|
| 86 |
+
#define __DEF_IF_HOST { }
|
| 87 |
+
#else /* !__CUDA_ARCH__ */
|
| 88 |
+
#define __DEF_IF_HOST ;
|
| 89 |
+
#endif /* __CUDA_ARCH__ */
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
|
| 93 |
+
#define EXCLUDE_FROM_RTC
|
| 94 |
+
/******************************************************************************
|
| 95 |
+
* reduce *
|
| 96 |
+
******************************************************************************/
|
| 97 |
+
__SM_80_RT_DECL__ unsigned __reduce_add_sync(unsigned mask, unsigned value) __DEF_IF_HOST
|
| 98 |
+
__SM_80_RT_DECL__ unsigned __reduce_min_sync(unsigned mask, unsigned value) __DEF_IF_HOST
|
| 99 |
+
__SM_80_RT_DECL__ unsigned __reduce_max_sync(unsigned mask, unsigned value) __DEF_IF_HOST
|
| 100 |
+
|
| 101 |
+
__SM_80_RT_DECL__ int __reduce_add_sync(unsigned mask, int value) __DEF_IF_HOST
|
| 102 |
+
__SM_80_RT_DECL__ int __reduce_min_sync(unsigned mask, int value) __DEF_IF_HOST
|
| 103 |
+
__SM_80_RT_DECL__ int __reduce_max_sync(unsigned mask, int value) __DEF_IF_HOST
|
| 104 |
+
|
| 105 |
+
__SM_80_RT_DECL__ unsigned __reduce_and_sync(unsigned mask, unsigned value) __DEF_IF_HOST
|
| 106 |
+
__SM_80_RT_DECL__ unsigned __reduce_or_sync(unsigned mask, unsigned value) __DEF_IF_HOST
|
| 107 |
+
__SM_80_RT_DECL__ unsigned __reduce_xor_sync(unsigned mask, unsigned value) __DEF_IF_HOST
|
| 108 |
+
|
| 109 |
+
#undef EXCLUDE_FROM_RTC
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
extern "C" {
|
| 113 |
+
inline __device__ void *__nv_associate_access_property(const void *ptr,
|
| 114 |
+
unsigned long long property) {
|
| 115 |
+
extern __device__ void *__nv_associate_access_property_impl(const void *,
|
| 116 |
+
unsigned long long);
|
| 117 |
+
return __nv_associate_access_property_impl(ptr, property);
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
inline __device__ void __nv_memcpy_async_shared_global_4(void *dst,
|
| 121 |
+
const void *src,
|
| 122 |
+
unsigned src_size) {
|
| 123 |
+
extern __device__ void __nv_memcpy_async_shared_global_4_impl(void *,
|
| 124 |
+
const void *,
|
| 125 |
+
unsigned);
|
| 126 |
+
__nv_memcpy_async_shared_global_4_impl(dst, src, src_size);
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
inline __device__ void __nv_memcpy_async_shared_global_8(void *dst,
|
| 130 |
+
const void *src,
|
| 131 |
+
unsigned src_size) {
|
| 132 |
+
extern __device__ void __nv_memcpy_async_shared_global_8_impl(void *,
|
| 133 |
+
const void *,
|
| 134 |
+
unsigned);
|
| 135 |
+
__nv_memcpy_async_shared_global_8_impl(dst, src, src_size);
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
inline __device__ void __nv_memcpy_async_shared_global_16(void *dst,
|
| 139 |
+
const void *src,
|
| 140 |
+
unsigned src_size) {
|
| 141 |
+
extern __device__ void __nv_memcpy_async_shared_global_16_impl(void *,
|
| 142 |
+
const void *,
|
| 143 |
+
unsigned);
|
| 144 |
+
__nv_memcpy_async_shared_global_16_impl(dst, src, src_size);
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
}
|
| 148 |
+
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 800 */
|
| 149 |
+
|
| 150 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 151 |
+
|
| 152 |
+
#undef __DEF_IF_HOST
|
| 153 |
+
#undef __SM_80_RT_DECL__
|
| 154 |
+
|
| 155 |
+
#if (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA)
|
| 156 |
+
#include "sm_80_rt.hpp"
|
| 157 |
+
#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
|
| 158 |
+
|
| 159 |
+
#endif /* !__SM_80_RT_H__ */
|
| 160 |
+
|
| 161 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__)
|
| 162 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 163 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__
|
| 164 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.hpp
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2017-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("crt/sm_80_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "crt/sm_80_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#if !defined(__SM_80_RT_HPP__)
|
| 61 |
+
#define __SM_80_RT_HPP__
|
| 62 |
+
|
| 63 |
+
#if defined(__CUDACC_RTC__)
|
| 64 |
+
#define __SM_80_RT_DECL__ __host__ __device__
|
| 65 |
+
#else /* !__CUDACC_RTC__ */
|
| 66 |
+
#define __SM_80_RT_DECL__ static __device__ __inline__
|
| 67 |
+
#endif /* __CUDACC_RTC__ */
|
| 68 |
+
|
| 69 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 70 |
+
|
| 71 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
|
| 72 |
+
|
| 73 |
+
/*******************************************************************************
|
| 74 |
+
* *
|
| 75 |
+
* *
|
| 76 |
+
* *
|
| 77 |
+
*******************************************************************************/
|
| 78 |
+
|
| 79 |
+
#include "builtin_types.h"
|
| 80 |
+
#include "device_types.h"
|
| 81 |
+
#include "host_defines.h"
|
| 82 |
+
|
| 83 |
+
/*******************************************************************************
|
| 84 |
+
* *
|
| 85 |
+
* Below are implementations of SM-8.0 builtin functions which are included as *
|
| 86 |
+
* source (instead of being built in to the compiler) *
|
| 87 |
+
* *
|
| 88 |
+
*******************************************************************************/
|
| 89 |
+
|
| 90 |
+
extern "C" {
|
| 91 |
+
__device_builtin__ __device__ unsigned __reduce_add_sync_unsigned_impl(unsigned, unsigned);
|
| 92 |
+
__device_builtin__ __device__ unsigned __reduce_min_sync_unsigned_impl(unsigned, unsigned);
|
| 93 |
+
__device_builtin__ __device__ unsigned __reduce_max_sync_unsigned_impl(unsigned, unsigned);
|
| 94 |
+
__device_builtin__ __device__ int __reduce_add_sync_signed_impl(unsigned, int);
|
| 95 |
+
__device_builtin__ __device__ int __reduce_min_sync_signed_impl(unsigned, int);
|
| 96 |
+
__device_builtin__ __device__ int __reduce_max_sync_signed_impl(unsigned, int);
|
| 97 |
+
__device_builtin__ __device__ unsigned __reduce_or_sync_unsigned_impl(unsigned, unsigned);
|
| 98 |
+
__device_builtin__ __device__ unsigned __reduce_and_sync_unsigned_impl(unsigned, unsigned);
|
| 99 |
+
__device_builtin__ __device__ unsigned __reduce_xor_sync_unsigned_impl(unsigned, unsigned);
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
__SM_80_RT_DECL__ unsigned __reduce_add_sync(unsigned mask, unsigned value) {
|
| 103 |
+
return __reduce_add_sync_unsigned_impl(mask, value);
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
__SM_80_RT_DECL__ unsigned __reduce_min_sync(unsigned mask, unsigned value) {
|
| 107 |
+
return __reduce_min_sync_unsigned_impl(mask, value);
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
__SM_80_RT_DECL__ unsigned __reduce_max_sync(unsigned mask, unsigned value) {
|
| 111 |
+
return __reduce_max_sync_unsigned_impl(mask, value);
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
__SM_80_RT_DECL__ int __reduce_add_sync(unsigned mask, int value) {
|
| 115 |
+
return __reduce_add_sync_signed_impl(mask, value);
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
__SM_80_RT_DECL__ int __reduce_min_sync(unsigned mask, int value) {
|
| 119 |
+
return __reduce_min_sync_signed_impl(mask, value);
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
__SM_80_RT_DECL__ int __reduce_max_sync(unsigned mask, int value) {
|
| 123 |
+
return __reduce_max_sync_signed_impl(mask, value);
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
__SM_80_RT_DECL__ unsigned __reduce_and_sync(unsigned mask, unsigned value) {
|
| 127 |
+
return __reduce_and_sync_unsigned_impl(mask, value);
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
__SM_80_RT_DECL__ unsigned __reduce_or_sync(unsigned mask, unsigned value) {
|
| 131 |
+
return __reduce_or_sync_unsigned_impl(mask, value);
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
__SM_80_RT_DECL__ unsigned __reduce_xor_sync(unsigned mask, unsigned value) {
|
| 135 |
+
return __reduce_xor_sync_unsigned_impl(mask, value);
|
| 136 |
+
}
|
| 137 |
+
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 800 */
|
| 138 |
+
|
| 139 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 140 |
+
|
| 141 |
+
#undef __SM_80_RT_DECL__
|
| 142 |
+
|
| 143 |
+
#endif /* !__SM_80_RT_HPP__ */
|
| 144 |
+
|
| 145 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__)
|
| 146 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 147 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__
|
| 148 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.h
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2022-2023 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("crt/sm_90_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "crt/sm_90_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_H__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#if !defined(__SM_90_RT_H__)
|
| 61 |
+
#define __SM_90_RT_H__
|
| 62 |
+
|
| 63 |
+
#if defined(__CUDACC_RTC__)
|
| 64 |
+
#define __SM_90_RT_DECL__ __host__ __device__
|
| 65 |
+
#else /* !__CUDACC_RTC__ */
|
| 66 |
+
#define __SM_90_RT_DECL__ static __device__ __inline__
|
| 67 |
+
#endif /* __CUDACC_RTC__ */
|
| 68 |
+
|
| 69 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 70 |
+
|
| 71 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
|
| 72 |
+
|
| 73 |
+
/*******************************************************************************
|
| 74 |
+
* *
|
| 75 |
+
* *
|
| 76 |
+
* *
|
| 77 |
+
*******************************************************************************/
|
| 78 |
+
|
| 79 |
+
#include "builtin_types.h"
|
| 80 |
+
#include "device_types.h"
|
| 81 |
+
#include "host_defines.h"
|
| 82 |
+
|
| 83 |
+
#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
|
| 84 |
+
#define __DEF_IF_HOST { }
|
| 85 |
+
#else /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
|
| 86 |
+
#define __DEF_IF_HOST ;
|
| 87 |
+
#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
|
| 88 |
+
|
| 89 |
+
//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
|
| 90 |
+
#define EXCLUDE_FROM_RTC
|
| 91 |
+
|
| 92 |
+
__SM_90_RT_DECL__ unsigned __isCtaShared(const void *ptr) __DEF_IF_HOST
|
| 93 |
+
__SM_90_RT_DECL__ unsigned __isClusterShared(const void *ptr) __DEF_IF_HOST
|
| 94 |
+
__SM_90_RT_DECL__ void *__cluster_map_shared_rank(const void *ptr, unsigned target_block_rank) __DEF_IF_HOST
|
| 95 |
+
__SM_90_RT_DECL__ unsigned __cluster_query_shared_rank(const void *ptr) __DEF_IF_HOST
|
| 96 |
+
__SM_90_RT_DECL__ uint2 __cluster_map_shared_multicast(const void *ptr, unsigned cluster_cta_mask) __DEF_IF_HOST
|
| 97 |
+
__SM_90_RT_DECL__ unsigned __clusterDimIsSpecified() __DEF_IF_HOST
|
| 98 |
+
__SM_90_RT_DECL__ dim3 __clusterDim() __DEF_IF_HOST
|
| 99 |
+
__SM_90_RT_DECL__ dim3 __clusterRelativeBlockIdx() __DEF_IF_HOST
|
| 100 |
+
__SM_90_RT_DECL__ dim3 __clusterGridDimInClusters() __DEF_IF_HOST
|
| 101 |
+
__SM_90_RT_DECL__ dim3 __clusterIdx() __DEF_IF_HOST
|
| 102 |
+
__SM_90_RT_DECL__ unsigned __clusterRelativeBlockRank() __DEF_IF_HOST
|
| 103 |
+
__SM_90_RT_DECL__ unsigned __clusterSizeInBlocks() __DEF_IF_HOST
|
| 104 |
+
__SM_90_RT_DECL__ void __cluster_barrier_arrive() __DEF_IF_HOST
|
| 105 |
+
__SM_90_RT_DECL__ void __cluster_barrier_arrive_relaxed() __DEF_IF_HOST
|
| 106 |
+
__SM_90_RT_DECL__ void __cluster_barrier_wait() __DEF_IF_HOST
|
| 107 |
+
__SM_90_RT_DECL__ void __threadfence_cluster() __DEF_IF_HOST
|
| 108 |
+
|
| 109 |
+
__SM_90_RT_DECL__ float2 atomicAdd(float2 *__address, float2 val) __DEF_IF_HOST
|
| 110 |
+
__SM_90_RT_DECL__ float2 atomicAdd_block(float2 *__address, float2 val) __DEF_IF_HOST
|
| 111 |
+
__SM_90_RT_DECL__ float2 atomicAdd_system(float2 *__address, float2 val) __DEF_IF_HOST
|
| 112 |
+
__SM_90_RT_DECL__ float4 atomicAdd(float4 *__address, float4 val) __DEF_IF_HOST
|
| 113 |
+
__SM_90_RT_DECL__ float4 atomicAdd_block(float4 *__address, float4 val) __DEF_IF_HOST
|
| 114 |
+
__SM_90_RT_DECL__ float4 atomicAdd_system(float4 *__address, float4 val) __DEF_IF_HOST
|
| 115 |
+
|
| 116 |
+
#undef EXCLUDE_FROM_RTC
|
| 117 |
+
|
| 118 |
+
//Note: below atomic functions are templates, so cannot be represented in NVRTC
|
| 119 |
+
//builtins representation, so they have to be parsed on every NVRTC compilation.
|
| 120 |
+
//(notice 'EXCLUDE_FROM_RTC' ends above)
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
#ifndef __NV_DISABLE_128_ATOMICS
|
| 124 |
+
// lgen definitions for 128b atomics
|
| 125 |
+
extern "C" {
|
| 126 |
+
__device__ __device_builtin__ void __u128AtomicCAS(void *, void *, void *, void *);
|
| 127 |
+
__device__ __device_builtin__ void __u128AtomicCAS_block(void *, void *, void *, void *);
|
| 128 |
+
__device__ __device_builtin__ void __u128AtomicCAS_system(void *, void *, void *, void *);
|
| 129 |
+
__device__ __device_builtin__ void __u128AtomicExch(void *, void *, void *);
|
| 130 |
+
__device__ __device_builtin__ void __u128AtomicExch_block(void *, void *, void *);
|
| 131 |
+
__device__ __device_builtin__ void __u128AtomicExch_system(void *, void *, void *);
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
// macro to get address of object, to workaround situations where the type overloads the "&" operator
|
| 135 |
+
#define __NV_ATOMIC_ADDRESSOF(__val) \
|
| 136 |
+
(void *)(&(const_cast<char &>(reinterpret_cast<const volatile char &>(__val))))
|
| 137 |
+
|
| 138 |
+
// enable_if
|
| 139 |
+
template<bool __b, typename _T>
|
| 140 |
+
struct __nv_atomic_enable_if { };
|
| 141 |
+
|
| 142 |
+
template<typename _T>
|
| 143 |
+
struct __nv_atomic_enable_if<true, _T> { typedef _T __type; };
|
| 144 |
+
|
| 145 |
+
// alignof
|
| 146 |
+
#if defined(__CUDACC_RTC__)
|
| 147 |
+
#define __NV_ATOMIC_ALIGNOF __alignof__
|
| 148 |
+
#else
|
| 149 |
+
#define __NV_ATOMIC_ALIGNOF __alignof
|
| 150 |
+
#endif
|
| 151 |
+
|
| 152 |
+
// trivially copyable
|
| 153 |
+
template <typename _T>
|
| 154 |
+
struct __nv_atomic_triv_cp_helper {
|
| 155 |
+
#if defined(__GNUC__)
|
| 156 |
+
#if (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)
|
| 157 |
+
static const bool __val = true;
|
| 158 |
+
#elif (__GNUC__ < 5)
|
| 159 |
+
static const bool __val = __has_trivial_copy(_T);
|
| 160 |
+
#else
|
| 161 |
+
static const bool __val = __is_trivially_copyable(_T);
|
| 162 |
+
#endif
|
| 163 |
+
#else
|
| 164 |
+
static const bool __val = __is_trivially_copyable(_T);
|
| 165 |
+
#endif
|
| 166 |
+
};
|
| 167 |
+
#define __NV_ATOMIC_TRIVIALLY_COPYABLE(_T) \
|
| 168 |
+
__nv_atomic_triv_cp_helper<_T>::__val
|
| 169 |
+
|
| 170 |
+
// return type
|
| 171 |
+
#if __cplusplus >= 202002L // C++20 or greater
|
| 172 |
+
#define __NV_ATOMIC_RET_TYPE(_T) _T
|
| 173 |
+
#else
|
| 174 |
+
#define __NV_ATOMIC_RET_TYPE(_T) typename \
|
| 175 |
+
__nv_atomic_enable_if<sizeof(_T) == 16 && \
|
| 176 |
+
__NV_ATOMIC_ALIGNOF(_T) >= 16 && \
|
| 177 |
+
__NV_ATOMIC_TRIVIALLY_COPYABLE(_T), _T>::__type
|
| 178 |
+
#endif
|
| 179 |
+
|
| 180 |
+
// requires
|
| 181 |
+
#if __cplusplus >= 202002L // C++20 or greater
|
| 182 |
+
#define __NV_ATOMIC_REQUIRES(_T) \
|
| 183 |
+
requires(sizeof(_T) == 16 && \
|
| 184 |
+
__NV_ATOMIC_ALIGNOF(_T) >= 16 && \
|
| 185 |
+
__NV_ATOMIC_TRIVIALLY_COPYABLE(_T))
|
| 186 |
+
#else
|
| 187 |
+
#define __NV_ATOMIC_REQUIRES(_T)
|
| 188 |
+
#endif
|
| 189 |
+
|
| 190 |
+
// temp value and return value
|
| 191 |
+
#if __cplusplus >= 201103L || defined(_MSC_VER) // C++11 or greater, or MSC
|
| 192 |
+
#define __NV_ATOMIC_TEMP(_T) union _U \
|
| 193 |
+
{_T __ret; __device__ __inline__ _U() {}}; _U __u
|
| 194 |
+
#define __NV_ATOMIC_RET(_T) __u.__ret
|
| 195 |
+
#else
|
| 196 |
+
#define __NV_ATOMIC_TEMP(_T) _T __ret
|
| 197 |
+
#define __NV_ATOMIC_RET(_T) __ret
|
| 198 |
+
#endif
|
| 199 |
+
|
| 200 |
+
// templated 128-bit atomics
|
| 201 |
+
template <typename _T>
|
| 202 |
+
__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
|
| 203 |
+
atomicCAS(_T *__address, _T __compare, _T __val) __NV_ATOMIC_REQUIRES(_T) {
|
| 204 |
+
__NV_ATOMIC_TEMP(_T);
|
| 205 |
+
__u128AtomicCAS((void *)(__address),
|
| 206 |
+
__NV_ATOMIC_ADDRESSOF(__compare),
|
| 207 |
+
__NV_ATOMIC_ADDRESSOF(__val),
|
| 208 |
+
__NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
|
| 209 |
+
return __NV_ATOMIC_RET(_T);
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
template <typename _T>
|
| 213 |
+
__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
|
| 214 |
+
atomicCAS_block(_T *__address, _T __compare, _T __val) __NV_ATOMIC_REQUIRES(_T) {
|
| 215 |
+
__NV_ATOMIC_TEMP(_T);
|
| 216 |
+
__u128AtomicCAS_block((void *)(__address),
|
| 217 |
+
__NV_ATOMIC_ADDRESSOF(__compare),
|
| 218 |
+
__NV_ATOMIC_ADDRESSOF(__val),
|
| 219 |
+
__NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
|
| 220 |
+
return __NV_ATOMIC_RET(_T);
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
template <typename _T>
|
| 224 |
+
__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
|
| 225 |
+
atomicCAS_system(_T *__address, _T __compare, _T __val) __NV_ATOMIC_REQUIRES(_T) {
|
| 226 |
+
__NV_ATOMIC_TEMP(_T);
|
| 227 |
+
__u128AtomicCAS_system((void *)(__address),
|
| 228 |
+
__NV_ATOMIC_ADDRESSOF(__compare),
|
| 229 |
+
__NV_ATOMIC_ADDRESSOF(__val),
|
| 230 |
+
__NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
|
| 231 |
+
return __NV_ATOMIC_RET(_T);
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
template <typename _T>
|
| 235 |
+
__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
|
| 236 |
+
atomicExch(_T *__address, _T __val) __NV_ATOMIC_REQUIRES(_T) {
|
| 237 |
+
__NV_ATOMIC_TEMP(_T);
|
| 238 |
+
__u128AtomicExch((void *)(__address),
|
| 239 |
+
__NV_ATOMIC_ADDRESSOF(__val),
|
| 240 |
+
__NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
|
| 241 |
+
return __NV_ATOMIC_RET(_T);
|
| 242 |
+
}
|
| 243 |
+
|
| 244 |
+
template <typename _T>
|
| 245 |
+
__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
|
| 246 |
+
atomicExch_block(_T *__address, _T __val) __NV_ATOMIC_REQUIRES(_T) {
|
| 247 |
+
__NV_ATOMIC_TEMP(_T);
|
| 248 |
+
__u128AtomicExch_block((void *)(__address),
|
| 249 |
+
__NV_ATOMIC_ADDRESSOF(__val),
|
| 250 |
+
__NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
|
| 251 |
+
return __NV_ATOMIC_RET(_T);
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
template <typename _T>
|
| 255 |
+
__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
|
| 256 |
+
atomicExch_system(_T *__address, _T __val) __NV_ATOMIC_REQUIRES(_T) {
|
| 257 |
+
__NV_ATOMIC_TEMP(_T);
|
| 258 |
+
__u128AtomicExch_system((void *)(__address),
|
| 259 |
+
__NV_ATOMIC_ADDRESSOF(__val),
|
| 260 |
+
__NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
|
| 261 |
+
return __NV_ATOMIC_RET(_T);
|
| 262 |
+
}
|
| 263 |
+
#endif /* !__NV_DISABLE_128_ATOMICS */
|
| 264 |
+
|
| 265 |
+
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 900 */
|
| 266 |
+
|
| 267 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 268 |
+
|
| 269 |
+
#undef __DEF_IF_HOST
|
| 270 |
+
#undef __SM_90_RT_DECL__
|
| 271 |
+
|
| 272 |
+
#if (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA)
|
| 273 |
+
#include "sm_90_rt.hpp"
|
| 274 |
+
#endif /* (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA) */
|
| 275 |
+
|
| 276 |
+
#endif /* !__SM_90_RT_H__ */
|
| 277 |
+
|
| 278 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_H__)
|
| 279 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 280 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_H__
|
| 281 |
+
#endif
|
| 282 |
+
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.hpp
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2022 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("crt/sm_90_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "crt/sm_90_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_HPP__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#if !defined(__SM_90_RT_HPP__)
|
| 61 |
+
#define __SM_90_RT_HPP__
|
| 62 |
+
|
| 63 |
+
#if defined(__CUDACC_RTC__)
|
| 64 |
+
#define __SM_90_RT_DECL__ __host__ __device__
|
| 65 |
+
#else /* !__CUDACC_RTC__ */
|
| 66 |
+
#define __SM_90_RT_DECL__ static __device__ __inline__
|
| 67 |
+
#endif /* __CUDACC_RTC__ */
|
| 68 |
+
|
| 69 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 70 |
+
|
| 71 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
|
| 72 |
+
|
| 73 |
+
/*******************************************************************************
|
| 74 |
+
* *
|
| 75 |
+
* *
|
| 76 |
+
* *
|
| 77 |
+
*******************************************************************************/
|
| 78 |
+
|
| 79 |
+
#include "builtin_types.h"
|
| 80 |
+
#include "device_types.h"
|
| 81 |
+
#include "host_defines.h"
|
| 82 |
+
|
| 83 |
+
/*******************************************************************************
|
| 84 |
+
* *
|
| 85 |
+
* Below are implementations of SM-9.0 builtin functions which are included as *
|
| 86 |
+
* source (instead of being built in to the compiler) *
|
| 87 |
+
* *
|
| 88 |
+
*******************************************************************************/
|
| 89 |
+
extern "C" {
|
| 90 |
+
__device__ unsigned __nv_isClusterShared_impl(const void *);
|
| 91 |
+
__device__ void * __nv_cluster_map_shared_rank_impl(const void *, unsigned);
|
| 92 |
+
__device__ unsigned __nv_cluster_query_shared_rank_impl(const void *);
|
| 93 |
+
__device__ unsigned __nv_clusterDimIsSpecifed_impl();
|
| 94 |
+
__device__ void __nv_clusterDim_impl(unsigned *, unsigned *, unsigned *);
|
| 95 |
+
__device__ void __nv_clusterRelativeBlockIdx_impl(unsigned *,
|
| 96 |
+
unsigned *, unsigned *);
|
| 97 |
+
__device__ void __nv_clusterGridDimInClusters_impl(unsigned *,
|
| 98 |
+
unsigned *, unsigned *);
|
| 99 |
+
__device__ void __nv_clusterIdx_impl(unsigned *, unsigned *, unsigned *);
|
| 100 |
+
__device__ unsigned __nv_clusterRelativeBlockRank_impl();
|
| 101 |
+
__device__ unsigned __nv_clusterSizeInBlocks_impl();
|
| 102 |
+
__device__ void __nv_cluster_barrier_arrive_impl();
|
| 103 |
+
__device__ void __nv_cluster_barrier_arrive_relaxed_impl();
|
| 104 |
+
__device__ void __nv_cluster_barrier_wait_impl();
|
| 105 |
+
__device__ void __nv_threadfence_cluster_impl();
|
| 106 |
+
|
| 107 |
+
__device__ __device_builtin__ float2 __f2AtomicAdd(float2 *, float2);
|
| 108 |
+
__device__ __device_builtin__ float2 __f2AtomicAdd_block(float2 *, float2);
|
| 109 |
+
__device__ __device_builtin__ float2 __f2AtomicAdd_system(float2 *, float2);
|
| 110 |
+
__device__ __device_builtin__ float4 __f4AtomicAdd(float4 *, float4);
|
| 111 |
+
__device__ __device_builtin__ float4 __f4AtomicAdd_block(float4 *, float4);
|
| 112 |
+
__device__ __device_builtin__ float4 __f4AtomicAdd_system(float4 *, float4);
|
| 113 |
+
} // extern "C"
|
| 114 |
+
|
| 115 |
+
__SM_90_RT_DECL__ unsigned __isCtaShared(const void *ptr)
|
| 116 |
+
{
|
| 117 |
+
return __isShared(ptr);
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
__SM_90_RT_DECL__ unsigned __isClusterShared(const void *ptr)
|
| 121 |
+
{
|
| 122 |
+
return __nv_isClusterShared_impl(ptr);
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
__SM_90_RT_DECL__ void *__cluster_map_shared_rank(const void *ptr,
|
| 126 |
+
unsigned target_block_rank)
|
| 127 |
+
{
|
| 128 |
+
return __nv_cluster_map_shared_rank_impl(ptr, target_block_rank);
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
__SM_90_RT_DECL__ unsigned __cluster_query_shared_rank(const void *ptr)
|
| 132 |
+
{
|
| 133 |
+
return __nv_cluster_query_shared_rank_impl(ptr);
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
__SM_90_RT_DECL__ uint2 __cluster_map_shared_multicast(const void *ptr,
|
| 137 |
+
unsigned int cluster_cta_mask)
|
| 138 |
+
{
|
| 139 |
+
return make_uint2((unsigned)__cvta_generic_to_shared(ptr), cluster_cta_mask);
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
__SM_90_RT_DECL__ unsigned __clusterDimIsSpecified()
|
| 143 |
+
{
|
| 144 |
+
return __nv_clusterDimIsSpecifed_impl();
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
__SM_90_RT_DECL__ dim3 __clusterDim()
|
| 148 |
+
{
|
| 149 |
+
unsigned x, y, z;
|
| 150 |
+
__nv_clusterDim_impl(&x, &y, &z);
|
| 151 |
+
return dim3(x,y,z);
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
__SM_90_RT_DECL__ dim3 __clusterRelativeBlockIdx()
|
| 155 |
+
{
|
| 156 |
+
unsigned x, y, z;
|
| 157 |
+
__nv_clusterRelativeBlockIdx_impl(&x, &y, &z);
|
| 158 |
+
return dim3(x,y,z);
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
__SM_90_RT_DECL__ dim3 __clusterGridDimInClusters()
|
| 162 |
+
{
|
| 163 |
+
unsigned x, y, z;
|
| 164 |
+
__nv_clusterGridDimInClusters_impl(&x, &y, &z);
|
| 165 |
+
return dim3(x,y,z);
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
__SM_90_RT_DECL__ dim3 __clusterIdx()
|
| 169 |
+
{
|
| 170 |
+
unsigned x, y, z;
|
| 171 |
+
__nv_clusterIdx_impl(&x, &y, &z);
|
| 172 |
+
return dim3(x,y,z);
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
__SM_90_RT_DECL__ unsigned __clusterRelativeBlockRank()
|
| 176 |
+
{
|
| 177 |
+
return __nv_clusterRelativeBlockRank_impl();
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
__SM_90_RT_DECL__ unsigned __clusterSizeInBlocks()
|
| 181 |
+
{
|
| 182 |
+
return __nv_clusterSizeInBlocks_impl();
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
__SM_90_RT_DECL__ void __cluster_barrier_arrive()
|
| 186 |
+
{
|
| 187 |
+
__nv_cluster_barrier_arrive_impl();
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
__SM_90_RT_DECL__ void __cluster_barrier_arrive_relaxed()
|
| 191 |
+
{
|
| 192 |
+
__nv_cluster_barrier_arrive_relaxed_impl();
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
__SM_90_RT_DECL__ void __cluster_barrier_wait()
|
| 196 |
+
{
|
| 197 |
+
__nv_cluster_barrier_wait_impl();
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
__SM_90_RT_DECL__ void __threadfence_cluster()
|
| 201 |
+
{
|
| 202 |
+
__nv_threadfence_cluster_impl();
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
/* Define __PTR for atomicAdd prototypes below, undef after done */
|
| 207 |
+
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
|
| 208 |
+
#define __PTR "l"
|
| 209 |
+
#else
|
| 210 |
+
#define __PTR "r"
|
| 211 |
+
#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
|
| 212 |
+
|
| 213 |
+
__SM_90_RT_DECL__ float2 atomicAdd(float2 *address, float2 val) {
|
| 214 |
+
return __f2AtomicAdd(address, val);
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
__SM_90_RT_DECL__ float2 atomicAdd_block(float2 *address, float2 val) {
|
| 218 |
+
return __f2AtomicAdd_block(address, val);
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
__SM_90_RT_DECL__ float2 atomicAdd_system(float2 *address, float2 val) {
|
| 222 |
+
return __f2AtomicAdd_system(address, val);
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
__SM_90_RT_DECL__ float4 atomicAdd(float4 *address, float4 val) {
|
| 226 |
+
return __f4AtomicAdd(address, val);
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
__SM_90_RT_DECL__ float4 atomicAdd_block(float4 *address, float4 val) {
|
| 230 |
+
return __f4AtomicAdd_block(address, val);
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
__SM_90_RT_DECL__ float4 atomicAdd_system(float4 *address, float4 val) {
|
| 234 |
+
return __f4AtomicAdd_system(address, val);
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 900 */
|
| 238 |
+
|
| 239 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 240 |
+
|
| 241 |
+
#undef __SM_90_RT_DECL__
|
| 242 |
+
|
| 243 |
+
#endif /* !__SM_90_RT_HPP__ */
|
| 244 |
+
|
| 245 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_HPP__)
|
| 246 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 247 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_HPP__
|
| 248 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/storage_class.h
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* NVIDIA_COPYRIGHT_BEGIN
|
| 3 |
+
*
|
| 4 |
+
* Copyright (c) 2008-2018, NVIDIA CORPORATION. All rights reserved.
|
| 5 |
+
*
|
| 6 |
+
* NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 7 |
+
* and proprietary rights in and to this software, related documentation
|
| 8 |
+
* and any modifications thereto. Any use, reproduction, disclosure or
|
| 9 |
+
* distribution of this software and related documentation without an express
|
| 10 |
+
* license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 11 |
+
*
|
| 12 |
+
* NVIDIA_COPYRIGHT_END
|
| 13 |
+
*/
|
| 14 |
+
|
| 15 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 16 |
+
#if defined(_MSC_VER)
|
| 17 |
+
#pragma message("crt/storage_class.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 18 |
+
#else
|
| 19 |
+
#warning "crt/storage_class.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 20 |
+
#endif
|
| 21 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 22 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__
|
| 23 |
+
#endif
|
| 24 |
+
|
| 25 |
+
#if !defined(__STORAGE_CLASS_H__)
|
| 26 |
+
#define __STORAGE_CLASS_H__
|
| 27 |
+
|
| 28 |
+
#if !defined(__var_used__)
|
| 29 |
+
|
| 30 |
+
#define __var_used__
|
| 31 |
+
|
| 32 |
+
#endif /* __var_used__ */
|
| 33 |
+
|
| 34 |
+
#if !defined(__loc_sc__)
|
| 35 |
+
|
| 36 |
+
#define __loc_sc__(loc, size, sc) \
|
| 37 |
+
__storage##_##sc##size##loc loc
|
| 38 |
+
|
| 39 |
+
#endif /* !__loc_sc__ */
|
| 40 |
+
|
| 41 |
+
#if !defined(__storage___device__)
|
| 42 |
+
#define __storage___device__ static __var_used__
|
| 43 |
+
#endif /* __storage___device__ */
|
| 44 |
+
|
| 45 |
+
#if !defined(__storage_extern__device__)
|
| 46 |
+
#define __storage_extern__device__ static __var_used__
|
| 47 |
+
#endif /* __storage_extern__device__ */
|
| 48 |
+
|
| 49 |
+
#if !defined(__storage_auto__device__)
|
| 50 |
+
#define __storage_auto__device__ @@@ COMPILER @@@ ERROR @@@
|
| 51 |
+
#endif /* __storage_auto__device__ */
|
| 52 |
+
|
| 53 |
+
#if !defined(__storage_static__device__)
|
| 54 |
+
#define __storage_static__device__ static __var_used__
|
| 55 |
+
#endif /* __storage_static__device__ */
|
| 56 |
+
|
| 57 |
+
#if !defined(__storage___constant__)
|
| 58 |
+
#define __storage___constant__ static __var_used__
|
| 59 |
+
#endif /* __storage___constant__ */
|
| 60 |
+
|
| 61 |
+
#if !defined(__storage_extern__constant__)
|
| 62 |
+
#define __storage_extern__constant__ static __var_used__
|
| 63 |
+
#endif /* __storage_extern__constant__ */
|
| 64 |
+
|
| 65 |
+
#if !defined(__storage_auto__constant__)
|
| 66 |
+
#define __storage_auto__constant__ @@@ COMPILER @@@ ERROR @@@
|
| 67 |
+
#endif /* __storage_auto__constant__ */
|
| 68 |
+
|
| 69 |
+
#if !defined(__storage_static__constant__)
|
| 70 |
+
#define __storage_static__constant__ static __var_used__
|
| 71 |
+
#endif /* __storage_static__constant__ */
|
| 72 |
+
|
| 73 |
+
#if !defined(__storage___shared__)
|
| 74 |
+
#define __storage___shared__ static __var_used__
|
| 75 |
+
#endif /* __storage___shared__ */
|
| 76 |
+
|
| 77 |
+
#if !defined(__storage_extern__shared__)
|
| 78 |
+
#define __storage_extern__shared__ static __var_used__
|
| 79 |
+
#endif /* __storage_extern__shared__ */
|
| 80 |
+
|
| 81 |
+
#if !defined(__storage_auto__shared__)
|
| 82 |
+
#define __storage_auto__shared__ static
|
| 83 |
+
#endif /* __storage_auto__shared__ */
|
| 84 |
+
|
| 85 |
+
#if !defined(__storage_static__shared__)
|
| 86 |
+
#define __storage_static__shared__ static __var_used__
|
| 87 |
+
#endif /* __storage_static__shared__ */
|
| 88 |
+
|
| 89 |
+
#if !defined(__storage__unsized__shared__)
|
| 90 |
+
#define __storage__unsized__shared__ @@@ COMPILER @@@ ERROR @@@
|
| 91 |
+
#endif /* __storage__unsized__shared__ */
|
| 92 |
+
|
| 93 |
+
#if !defined(__storage_extern_unsized__shared__)
|
| 94 |
+
#define __storage_extern_unsized__shared__ static __var_used__
|
| 95 |
+
#endif /* __storage_extern_unsized__shared__ */
|
| 96 |
+
|
| 97 |
+
#if !defined(__storage_auto_unsized__shared__)
|
| 98 |
+
#define __storage_auto_unsized__shared__ @@@ COMPILER @@@ ERROR @@@
|
| 99 |
+
#endif /* __storage_auto_unsized__shared__ */
|
| 100 |
+
|
| 101 |
+
#if !defined(__storage_static_unsized__shared__)
|
| 102 |
+
#define __storage_static_unsized__shared__ @@@ COMPILER @@@ ERROR @@@
|
| 103 |
+
#endif /* __storage_static_unsized__shared__ */
|
| 104 |
+
|
| 105 |
+
#if !defined(__storage___text__)
|
| 106 |
+
#define __storage___text__ static __var_used__
|
| 107 |
+
#endif /* __storage___text__ */
|
| 108 |
+
|
| 109 |
+
#if !defined(__storage_extern__text__)
|
| 110 |
+
#define __storage_extern__text__ static __var_used__
|
| 111 |
+
#endif /* __storage_extern__text__ */
|
| 112 |
+
|
| 113 |
+
#if !defined(__storage_auto__text__)
|
| 114 |
+
#define __storage_auto__text__ @@@ COMPILER @@@ ERROR @@@
|
| 115 |
+
#endif /* __storage_auto__text__ */
|
| 116 |
+
|
| 117 |
+
#if !defined(__storage_static__text__)
|
| 118 |
+
#define __storage_static__text__ static __var_used__
|
| 119 |
+
#endif /* __storage_static__text__ */
|
| 120 |
+
|
| 121 |
+
#if !defined(__storage___surf__)
|
| 122 |
+
#define __storage___surf__ static __var_used__
|
| 123 |
+
#endif /* __storage___surf__ */
|
| 124 |
+
|
| 125 |
+
#if !defined(__storage_extern__surf__)
|
| 126 |
+
#define __storage_extern__surf__ static __var_used__
|
| 127 |
+
#endif /* __storage_extern__surf__ */
|
| 128 |
+
|
| 129 |
+
#if !defined(__storage_auto__surf__)
|
| 130 |
+
#define __storage_auto__surf__ @@@ COMPILER @@@ ERROR @@@
|
| 131 |
+
#endif /* __storage_auto__surf__ */
|
| 132 |
+
|
| 133 |
+
#if !defined(__storage_static__surf__)
|
| 134 |
+
#define __storage_static__surf__ static __var_used__
|
| 135 |
+
#endif /* __storage_static__surf__ */
|
| 136 |
+
|
| 137 |
+
#endif /* !__STORAGE_CLASS_H__ */
|
| 138 |
+
|
| 139 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__)
|
| 140 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 141 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__
|
| 142 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaGL.h
ADDED
|
@@ -0,0 +1,608 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef CUDAGL_H
|
| 51 |
+
#define CUDAGL_H
|
| 52 |
+
|
| 53 |
+
#include <cuda.h>
|
| 54 |
+
#include <GL/gl.h>
|
| 55 |
+
|
| 56 |
+
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
|
| 57 |
+
#define __CUDA_DEPRECATED
|
| 58 |
+
#elif defined(_MSC_VER)
|
| 59 |
+
#define __CUDA_DEPRECATED __declspec(deprecated)
|
| 60 |
+
#elif defined(__GNUC__)
|
| 61 |
+
#define __CUDA_DEPRECATED __attribute__((deprecated))
|
| 62 |
+
#else
|
| 63 |
+
#define __CUDA_DEPRECATED
|
| 64 |
+
#endif
|
| 65 |
+
|
| 66 |
+
#ifdef CUDA_FORCE_API_VERSION
|
| 67 |
+
#error "CUDA_FORCE_API_VERSION is no longer supported."
|
| 68 |
+
#endif
|
| 69 |
+
|
| 70 |
+
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
|
| 71 |
+
#define __CUDA_API_PER_THREAD_DEFAULT_STREAM
|
| 72 |
+
#define __CUDA_API_PTDS(api) api ## _ptds
|
| 73 |
+
#define __CUDA_API_PTSZ(api) api ## _ptsz
|
| 74 |
+
#else
|
| 75 |
+
#define __CUDA_API_PTDS(api) api
|
| 76 |
+
#define __CUDA_API_PTSZ(api) api
|
| 77 |
+
#endif
|
| 78 |
+
|
| 79 |
+
#define cuGLCtxCreate cuGLCtxCreate_v2
|
| 80 |
+
#define cuGLMapBufferObject __CUDA_API_PTDS(cuGLMapBufferObject_v2)
|
| 81 |
+
#define cuGLMapBufferObjectAsync __CUDA_API_PTSZ(cuGLMapBufferObjectAsync_v2)
|
| 82 |
+
#define cuGLGetDevices cuGLGetDevices_v2
|
| 83 |
+
|
| 84 |
+
#ifdef __cplusplus
|
| 85 |
+
extern "C" {
|
| 86 |
+
#endif
|
| 87 |
+
|
| 88 |
+
/**
|
| 89 |
+
* \file cudaGL.h
|
| 90 |
+
* \brief Header file for the OpenGL interoperability functions of the
|
| 91 |
+
* low-level CUDA driver application programming interface.
|
| 92 |
+
*/
|
| 93 |
+
|
| 94 |
+
/**
|
| 95 |
+
* \defgroup CUDA_GL OpenGL Interoperability
|
| 96 |
+
* \ingroup CUDA_DRIVER
|
| 97 |
+
*
|
| 98 |
+
* ___MANBRIEF___ OpenGL interoperability functions of the low-level CUDA
|
| 99 |
+
* driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
|
| 100 |
+
*
|
| 101 |
+
* This section describes the OpenGL interoperability functions of the
|
| 102 |
+
* low-level CUDA driver application programming interface. Note that mapping
|
| 103 |
+
* of OpenGL resources is performed with the graphics API agnostic, resource
|
| 104 |
+
* mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
|
| 105 |
+
*
|
| 106 |
+
* @{
|
| 107 |
+
*/
|
| 108 |
+
|
| 109 |
+
#if defined(_WIN32)
|
| 110 |
+
#if !defined(WGL_NV_gpu_affinity)
|
| 111 |
+
typedef void* HGPUNV;
|
| 112 |
+
#endif
|
| 113 |
+
#endif /* _WIN32 */
|
| 114 |
+
|
| 115 |
+
/**
|
| 116 |
+
* \brief Registers an OpenGL buffer object
|
| 117 |
+
*
|
| 118 |
+
* Registers the buffer object specified by \p buffer for access by
|
| 119 |
+
* CUDA. A handle to the registered object is returned as \p
|
| 120 |
+
* pCudaResource. The register flags \p Flags specify the intended usage,
|
| 121 |
+
* as follows:
|
| 122 |
+
*
|
| 123 |
+
* - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
|
| 124 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 125 |
+
* read from and written to by CUDA. This is the default value.
|
| 126 |
+
* - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
|
| 127 |
+
* will not write to this resource.
|
| 128 |
+
* - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
|
| 129 |
+
* CUDA will not read from this resource and will write over the
|
| 130 |
+
* entire contents of the resource, so none of the data previously
|
| 131 |
+
* stored in the resource will be preserved.
|
| 132 |
+
*
|
| 133 |
+
* \param pCudaResource - Pointer to the returned object handle
|
| 134 |
+
* \param buffer - name of buffer object to be registered
|
| 135 |
+
* \param Flags - Register flags
|
| 136 |
+
*
|
| 137 |
+
* \return
|
| 138 |
+
* ::CUDA_SUCCESS,
|
| 139 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 140 |
+
* ::CUDA_ERROR_ALREADY_MAPPED,
|
| 141 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 142 |
+
* ::CUDA_ERROR_OPERATING_SYSTEM
|
| 143 |
+
* \notefnerr
|
| 144 |
+
*
|
| 145 |
+
* \sa
|
| 146 |
+
* ::cuGraphicsUnregisterResource,
|
| 147 |
+
* ::cuGraphicsMapResources,
|
| 148 |
+
* ::cuGraphicsResourceGetMappedPointer,
|
| 149 |
+
* ::cudaGraphicsGLRegisterBuffer
|
| 150 |
+
*/
|
| 151 |
+
CUresult CUDAAPI cuGraphicsGLRegisterBuffer(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
|
| 152 |
+
|
| 153 |
+
/**
|
| 154 |
+
* \brief Register an OpenGL texture or renderbuffer object
|
| 155 |
+
*
|
| 156 |
+
* Registers the texture or renderbuffer object specified by \p image for access by CUDA.
|
| 157 |
+
* A handle to the registered object is returned as \p pCudaResource.
|
| 158 |
+
*
|
| 159 |
+
* \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D,
|
| 160 |
+
* ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY,
|
| 161 |
+
* or ::GL_RENDERBUFFER.
|
| 162 |
+
*
|
| 163 |
+
* The register flags \p Flags specify the intended usage, as follows:
|
| 164 |
+
*
|
| 165 |
+
* - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
|
| 166 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 167 |
+
* read from and written to by CUDA. This is the default value.
|
| 168 |
+
* - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
|
| 169 |
+
* will not write to this resource.
|
| 170 |
+
* - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
|
| 171 |
+
* CUDA will not read from this resource and will write over the
|
| 172 |
+
* entire contents of the resource, so none of the data previously
|
| 173 |
+
* stored in the resource will be preserved.
|
| 174 |
+
* - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
|
| 175 |
+
* bind this resource to a surface reference.
|
| 176 |
+
* - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
|
| 177 |
+
* texture gather operations on this resource.
|
| 178 |
+
*
|
| 179 |
+
* The following image formats are supported. For brevity's sake, the list is abbreviated.
|
| 180 |
+
* For ex., {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats
|
| 181 |
+
* {GL_R8, GL_R16, GL_RG8, GL_RG16} :
|
| 182 |
+
* - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
|
| 183 |
+
* - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
|
| 184 |
+
* - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
|
| 185 |
+
* {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
|
| 186 |
+
*
|
| 187 |
+
* The following image classes are currently disallowed:
|
| 188 |
+
* - Textures with borders
|
| 189 |
+
* - Multisampled renderbuffers
|
| 190 |
+
*
|
| 191 |
+
* \param pCudaResource - Pointer to the returned object handle
|
| 192 |
+
* \param image - name of texture or renderbuffer object to be registered
|
| 193 |
+
* \param target - Identifies the type of object specified by \p image
|
| 194 |
+
* \param Flags - Register flags
|
| 195 |
+
*
|
| 196 |
+
* \return
|
| 197 |
+
* ::CUDA_SUCCESS,
|
| 198 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 199 |
+
* ::CUDA_ERROR_ALREADY_MAPPED,
|
| 200 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 201 |
+
* ::CUDA_ERROR_OPERATING_SYSTEM
|
| 202 |
+
* \notefnerr
|
| 203 |
+
*
|
| 204 |
+
* \sa
|
| 205 |
+
* ::cuGraphicsUnregisterResource,
|
| 206 |
+
* ::cuGraphicsMapResources,
|
| 207 |
+
* ::cuGraphicsSubResourceGetMappedArray,
|
| 208 |
+
* ::cudaGraphicsGLRegisterImage
|
| 209 |
+
*/
|
| 210 |
+
CUresult CUDAAPI cuGraphicsGLRegisterImage(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
|
| 211 |
+
|
| 212 |
+
#ifdef _WIN32
|
| 213 |
+
/**
|
| 214 |
+
* \brief Gets the CUDA device associated with hGpu
|
| 215 |
+
*
|
| 216 |
+
* Returns in \p *pDevice the CUDA device associated with a \p hGpu, if
|
| 217 |
+
* applicable.
|
| 218 |
+
*
|
| 219 |
+
* \param pDevice - Device associated with hGpu
|
| 220 |
+
* \param hGpu - Handle to a GPU, as queried via ::WGL_NV_gpu_affinity()
|
| 221 |
+
*
|
| 222 |
+
* \return
|
| 223 |
+
* ::CUDA_SUCCESS,
|
| 224 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 225 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 226 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 227 |
+
* ::CUDA_ERROR_INVALID_VALUE
|
| 228 |
+
* \notefnerr
|
| 229 |
+
*
|
| 230 |
+
* \sa ::cuGLMapBufferObject,
|
| 231 |
+
* ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
|
| 232 |
+
* ::cuGLUnregisterBufferObject, ::cuGLUnmapBufferObjectAsync,
|
| 233 |
+
* ::cuGLSetBufferObjectMapFlags,
|
| 234 |
+
* ::cudaWGLGetDevice
|
| 235 |
+
*/
|
| 236 |
+
CUresult CUDAAPI cuWGLGetDevice(CUdevice *pDevice, HGPUNV hGpu);
|
| 237 |
+
#endif /* _WIN32 */
|
| 238 |
+
|
| 239 |
+
/**
|
| 240 |
+
* CUDA devices corresponding to an OpenGL device
|
| 241 |
+
*/
|
| 242 |
+
typedef enum CUGLDeviceList_enum {
|
| 243 |
+
CU_GL_DEVICE_LIST_ALL = 0x01, /**< The CUDA devices for all GPUs used by the current OpenGL context */
|
| 244 |
+
CU_GL_DEVICE_LIST_CURRENT_FRAME = 0x02, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
|
| 245 |
+
CU_GL_DEVICE_LIST_NEXT_FRAME = 0x03, /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
|
| 246 |
+
} CUGLDeviceList;
|
| 247 |
+
|
| 248 |
+
/**
|
| 249 |
+
* \brief Gets the CUDA devices associated with the current OpenGL context
|
| 250 |
+
*
|
| 251 |
+
* Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices
|
| 252 |
+
* corresponding to the current OpenGL context. Also returns in \p *pCudaDevices
|
| 253 |
+
* at most cudaDeviceCount of the CUDA-compatible devices corresponding to
|
| 254 |
+
* the current OpenGL context. If any of the GPUs being used by the current OpenGL
|
| 255 |
+
* context are not CUDA capable then the call will return CUDA_ERROR_NO_DEVICE.
|
| 256 |
+
*
|
| 257 |
+
* The \p deviceList argument may be any of the following:
|
| 258 |
+
* - ::CU_GL_DEVICE_LIST_ALL: Query all devices used by the current OpenGL context.
|
| 259 |
+
* - ::CU_GL_DEVICE_LIST_CURRENT_FRAME: Query the devices used by the current OpenGL context to
|
| 260 |
+
* render the current frame (in SLI).
|
| 261 |
+
* - ::CU_GL_DEVICE_LIST_NEXT_FRAME: Query the devices used by the current OpenGL context to
|
| 262 |
+
* render the next frame (in SLI). Note that this is a prediction, it can't be guaranteed that
|
| 263 |
+
* this is correct in all cases.
|
| 264 |
+
*
|
| 265 |
+
* \param pCudaDeviceCount - Returned number of CUDA devices.
|
| 266 |
+
* \param pCudaDevices - Returned CUDA devices.
|
| 267 |
+
* \param cudaDeviceCount - The size of the output device array pCudaDevices.
|
| 268 |
+
* \param deviceList - The set of devices to return.
|
| 269 |
+
*
|
| 270 |
+
* \return
|
| 271 |
+
* ::CUDA_SUCCESS,
|
| 272 |
+
* ::CUDA_ERROR_NO_DEVICE,
|
| 273 |
+
* ::CUDA_ERROR_INVALID_VALUE,
|
| 274 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 275 |
+
* ::CUDA_ERROR_INVALID_GRAPHICS_CONTEXT,
|
| 276 |
+
* ::CUDA_ERROR_OPERATING_SYSTEM
|
| 277 |
+
*
|
| 278 |
+
* \notefnerr
|
| 279 |
+
*
|
| 280 |
+
* \sa
|
| 281 |
+
* ::cuWGLGetDevice,
|
| 282 |
+
* ::cudaGLGetDevices
|
| 283 |
+
*/
|
| 284 |
+
CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
|
| 285 |
+
|
| 286 |
+
/**
|
| 287 |
+
* \defgroup CUDA_GL_DEPRECATED OpenGL Interoperability [DEPRECATED]
|
| 288 |
+
*
|
| 289 |
+
* ___MANBRIEF___ deprecated OpenGL interoperability functions of the low-level
|
| 290 |
+
* CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
|
| 291 |
+
*
|
| 292 |
+
* This section describes deprecated OpenGL interoperability functionality.
|
| 293 |
+
*
|
| 294 |
+
* @{
|
| 295 |
+
*/
|
| 296 |
+
|
| 297 |
+
/** Flags to map or unmap a resource */
|
| 298 |
+
typedef enum CUGLmap_flags_enum {
|
| 299 |
+
CU_GL_MAP_RESOURCE_FLAGS_NONE = 0x00,
|
| 300 |
+
CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
|
| 301 |
+
CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02,
|
| 302 |
+
} CUGLmap_flags;
|
| 303 |
+
|
| 304 |
+
/**
|
| 305 |
+
* \brief Create a CUDA context for interoperability with OpenGL
|
| 306 |
+
*
|
| 307 |
+
* \deprecated This function is deprecated as of Cuda 5.0.
|
| 308 |
+
*
|
| 309 |
+
* This function is deprecated and should no longer be used. It is
|
| 310 |
+
* no longer necessary to associate a CUDA context with an OpenGL
|
| 311 |
+
* context in order to achieve maximum interoperability performance.
|
| 312 |
+
*
|
| 313 |
+
* \param pCtx - Returned CUDA context
|
| 314 |
+
* \param Flags - Options for CUDA context creation
|
| 315 |
+
* \param device - Device on which to create the context
|
| 316 |
+
*
|
| 317 |
+
* \return
|
| 318 |
+
* ::CUDA_SUCCESS,
|
| 319 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 320 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 321 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 322 |
+
* ::CUDA_ERROR_INVALID_VALUE,
|
| 323 |
+
* ::CUDA_ERROR_OUT_OF_MEMORY
|
| 324 |
+
* \notefnerr
|
| 325 |
+
*
|
| 326 |
+
* \sa ::cuCtxCreate, ::cuGLInit, ::cuGLMapBufferObject,
|
| 327 |
+
* ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
|
| 328 |
+
* ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
|
| 329 |
+
* ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
|
| 330 |
+
* ::cuWGLGetDevice
|
| 331 |
+
*/
|
| 332 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
|
| 333 |
+
|
| 334 |
+
/**
|
| 335 |
+
* \brief Initializes OpenGL interoperability
|
| 336 |
+
*
|
| 337 |
+
* \deprecated This function is deprecated as of Cuda 3.0.
|
| 338 |
+
*
|
| 339 |
+
* Initializes OpenGL interoperability. This function is deprecated
|
| 340 |
+
* and calling it is no longer required. It may fail if the needed
|
| 341 |
+
* OpenGL driver facilities are not available.
|
| 342 |
+
*
|
| 343 |
+
* \return
|
| 344 |
+
* ::CUDA_SUCCESS,
|
| 345 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 346 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 347 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 348 |
+
* ::CUDA_ERROR_UNKNOWN
|
| 349 |
+
* \notefnerr
|
| 350 |
+
*
|
| 351 |
+
* \sa ::cuGLMapBufferObject,
|
| 352 |
+
* ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
|
| 353 |
+
* ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
|
| 354 |
+
* ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
|
| 355 |
+
* ::cuWGLGetDevice
|
| 356 |
+
*/
|
| 357 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLInit(void);
|
| 358 |
+
|
| 359 |
+
/**
|
| 360 |
+
* \brief Registers an OpenGL buffer object
|
| 361 |
+
*
|
| 362 |
+
* \deprecated This function is deprecated as of Cuda 3.0.
|
| 363 |
+
*
|
| 364 |
+
* Registers the buffer object specified by \p buffer for access by
|
| 365 |
+
* CUDA. This function must be called before CUDA can map the buffer
|
| 366 |
+
* object. There must be a valid OpenGL context bound to the current
|
| 367 |
+
* thread when this function is called, and the buffer name is
|
| 368 |
+
* resolved by that context.
|
| 369 |
+
*
|
| 370 |
+
* \param buffer - The name of the buffer object to register.
|
| 371 |
+
*
|
| 372 |
+
* \return
|
| 373 |
+
* ::CUDA_SUCCESS,
|
| 374 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 375 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 376 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 377 |
+
* ::CUDA_ERROR_ALREADY_MAPPED
|
| 378 |
+
* \notefnerr
|
| 379 |
+
*
|
| 380 |
+
* \sa ::cuGraphicsGLRegisterBuffer
|
| 381 |
+
*/
|
| 382 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLRegisterBufferObject(GLuint buffer);
|
| 383 |
+
|
| 384 |
+
/**
|
| 385 |
+
* \brief Maps an OpenGL buffer object
|
| 386 |
+
*
|
| 387 |
+
* \deprecated This function is deprecated as of Cuda 3.0.
|
| 388 |
+
*
|
| 389 |
+
* Maps the buffer object specified by \p buffer into the address space of the
|
| 390 |
+
* current CUDA context and returns in \p *dptr and \p *size the base pointer
|
| 391 |
+
* and size of the resulting mapping.
|
| 392 |
+
*
|
| 393 |
+
* There must be a valid OpenGL context bound to the current thread
|
| 394 |
+
* when this function is called. This must be the same context, or a
|
| 395 |
+
* member of the same shareGroup, as the context that was bound when
|
| 396 |
+
* the buffer was registered.
|
| 397 |
+
*
|
| 398 |
+
* All streams in the current CUDA context are synchronized with the
|
| 399 |
+
* current GL context.
|
| 400 |
+
*
|
| 401 |
+
* \param dptr - Returned mapped base pointer
|
| 402 |
+
* \param size - Returned size of mapping
|
| 403 |
+
* \param buffer - The name of the buffer object to map
|
| 404 |
+
*
|
| 405 |
+
* \return
|
| 406 |
+
* ::CUDA_SUCCESS,
|
| 407 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 408 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 409 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 410 |
+
* ::CUDA_ERROR_INVALID_VALUE,
|
| 411 |
+
* ::CUDA_ERROR_MAP_FAILED
|
| 412 |
+
* \notefnerr
|
| 413 |
+
*
|
| 414 |
+
* \sa ::cuGraphicsMapResources
|
| 415 |
+
*/
|
| 416 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr *dptr, size_t *size, GLuint buffer);
|
| 417 |
+
|
| 418 |
+
/**
|
| 419 |
+
* \brief Unmaps an OpenGL buffer object
|
| 420 |
+
*
|
| 421 |
+
* \deprecated This function is deprecated as of Cuda 3.0.
|
| 422 |
+
*
|
| 423 |
+
* Unmaps the buffer object specified by \p buffer for access by CUDA.
|
| 424 |
+
*
|
| 425 |
+
* There must be a valid OpenGL context bound to the current thread
|
| 426 |
+
* when this function is called. This must be the same context, or a
|
| 427 |
+
* member of the same shareGroup, as the context that was bound when
|
| 428 |
+
* the buffer was registered.
|
| 429 |
+
*
|
| 430 |
+
* All streams in the current CUDA context are synchronized with the
|
| 431 |
+
* current GL context.
|
| 432 |
+
*
|
| 433 |
+
* \param buffer - Buffer object to unmap
|
| 434 |
+
*
|
| 435 |
+
* \return
|
| 436 |
+
* ::CUDA_SUCCESS,
|
| 437 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 438 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 439 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 440 |
+
* ::CUDA_ERROR_INVALID_VALUE
|
| 441 |
+
* \notefnerr
|
| 442 |
+
*
|
| 443 |
+
* \sa ::cuGraphicsUnmapResources
|
| 444 |
+
*/
|
| 445 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObject(GLuint buffer);
|
| 446 |
+
|
| 447 |
+
/**
|
| 448 |
+
* \brief Unregister an OpenGL buffer object
|
| 449 |
+
*
|
| 450 |
+
* \deprecated This function is deprecated as of Cuda 3.0.
|
| 451 |
+
*
|
| 452 |
+
* Unregisters the buffer object specified by \p buffer. This
|
| 453 |
+
* releases any resources associated with the registered buffer.
|
| 454 |
+
* After this call, the buffer may no longer be mapped for access by
|
| 455 |
+
* CUDA.
|
| 456 |
+
*
|
| 457 |
+
* There must be a valid OpenGL context bound to the current thread
|
| 458 |
+
* when this function is called. This must be the same context, or a
|
| 459 |
+
* member of the same shareGroup, as the context that was bound when
|
| 460 |
+
* the buffer was registered.
|
| 461 |
+
*
|
| 462 |
+
* \param buffer - Name of the buffer object to unregister
|
| 463 |
+
*
|
| 464 |
+
* \return
|
| 465 |
+
* ::CUDA_SUCCESS,
|
| 466 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 467 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 468 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 469 |
+
* ::CUDA_ERROR_INVALID_VALUE
|
| 470 |
+
* \notefnerr
|
| 471 |
+
*
|
| 472 |
+
* \sa ::cuGraphicsUnregisterResource
|
| 473 |
+
*/
|
| 474 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnregisterBufferObject(GLuint buffer);
|
| 475 |
+
|
| 476 |
+
/**
|
| 477 |
+
* \brief Set the map flags for an OpenGL buffer object
|
| 478 |
+
*
|
| 479 |
+
* \deprecated This function is deprecated as of Cuda 3.0.
|
| 480 |
+
*
|
| 481 |
+
* Sets the map flags for the buffer object specified by \p buffer.
|
| 482 |
+
*
|
| 483 |
+
* Changes to \p Flags will take effect the next time \p buffer is mapped.
|
| 484 |
+
* The \p Flags argument may be any of the following:
|
| 485 |
+
* - ::CU_GL_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
|
| 486 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 487 |
+
* read from and written to by CUDA kernels. This is the default value.
|
| 488 |
+
* - ::CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA kernels which
|
| 489 |
+
* access this resource will not write to this resource.
|
| 490 |
+
* - ::CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that CUDA kernels
|
| 491 |
+
* which access this resource will not read from this resource and will
|
| 492 |
+
* write over the entire contents of the resource, so none of the data
|
| 493 |
+
* previously stored in the resource will be preserved.
|
| 494 |
+
*
|
| 495 |
+
* If \p buffer has not been registered for use with CUDA, then
|
| 496 |
+
* ::CUDA_ERROR_INVALID_HANDLE is returned. If \p buffer is presently
|
| 497 |
+
* mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is returned.
|
| 498 |
+
*
|
| 499 |
+
* There must be a valid OpenGL context bound to the current thread
|
| 500 |
+
* when this function is called. This must be the same context, or a
|
| 501 |
+
* member of the same shareGroup, as the context that was bound when
|
| 502 |
+
* the buffer was registered.
|
| 503 |
+
*
|
| 504 |
+
* \param buffer - Buffer object to unmap
|
| 505 |
+
* \param Flags - Map flags
|
| 506 |
+
*
|
| 507 |
+
* \return
|
| 508 |
+
* ::CUDA_SUCCESS,
|
| 509 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 510 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 511 |
+
* ::CUDA_ERROR_ALREADY_MAPPED,
|
| 512 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 513 |
+
* \notefnerr
|
| 514 |
+
*
|
| 515 |
+
* \sa ::cuGraphicsResourceSetMapFlags
|
| 516 |
+
*/
|
| 517 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLSetBufferObjectMapFlags(GLuint buffer, unsigned int Flags);
|
| 518 |
+
|
| 519 |
+
/**
|
| 520 |
+
* \brief Maps an OpenGL buffer object
|
| 521 |
+
*
|
| 522 |
+
* \deprecated This function is deprecated as of Cuda 3.0.
|
| 523 |
+
*
|
| 524 |
+
* Maps the buffer object specified by \p buffer into the address space of the
|
| 525 |
+
* current CUDA context and returns in \p *dptr and \p *size the base pointer
|
| 526 |
+
* and size of the resulting mapping.
|
| 527 |
+
*
|
| 528 |
+
* There must be a valid OpenGL context bound to the current thread
|
| 529 |
+
* when this function is called. This must be the same context, or a
|
| 530 |
+
* member of the same shareGroup, as the context that was bound when
|
| 531 |
+
* the buffer was registered.
|
| 532 |
+
*
|
| 533 |
+
* Stream \p hStream in the current CUDA context is synchronized with
|
| 534 |
+
* the current GL context.
|
| 535 |
+
*
|
| 536 |
+
* \param dptr - Returned mapped base pointer
|
| 537 |
+
* \param size - Returned size of mapping
|
| 538 |
+
* \param buffer - The name of the buffer object to map
|
| 539 |
+
* \param hStream - Stream to synchronize
|
| 540 |
+
*
|
| 541 |
+
* \return
|
| 542 |
+
* ::CUDA_SUCCESS,
|
| 543 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 544 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 545 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 546 |
+
* ::CUDA_ERROR_INVALID_VALUE,
|
| 547 |
+
* ::CUDA_ERROR_MAP_FAILED
|
| 548 |
+
* \notefnerr
|
| 549 |
+
*
|
| 550 |
+
* \sa ::cuGraphicsMapResources
|
| 551 |
+
*/
|
| 552 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr *dptr, size_t *size, GLuint buffer, CUstream hStream);
|
| 553 |
+
|
| 554 |
+
/**
|
| 555 |
+
* \brief Unmaps an OpenGL buffer object
|
| 556 |
+
*
|
| 557 |
+
* \deprecated This function is deprecated as of Cuda 3.0.
|
| 558 |
+
*
|
| 559 |
+
* Unmaps the buffer object specified by \p buffer for access by CUDA.
|
| 560 |
+
*
|
| 561 |
+
* There must be a valid OpenGL context bound to the current thread
|
| 562 |
+
* when this function is called. This must be the same context, or a
|
| 563 |
+
* member of the same shareGroup, as the context that was bound when
|
| 564 |
+
* the buffer was registered.
|
| 565 |
+
*
|
| 566 |
+
* Stream \p hStream in the current CUDA context is synchronized with
|
| 567 |
+
* the current GL context.
|
| 568 |
+
*
|
| 569 |
+
* \param buffer - Name of the buffer object to unmap
|
| 570 |
+
* \param hStream - Stream to synchronize
|
| 571 |
+
*
|
| 572 |
+
* \return
|
| 573 |
+
* ::CUDA_SUCCESS,
|
| 574 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 575 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 576 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 577 |
+
* ::CUDA_ERROR_INVALID_VALUE
|
| 578 |
+
* \notefnerr
|
| 579 |
+
*
|
| 580 |
+
* \sa ::cuGraphicsUnmapResources
|
| 581 |
+
*/
|
| 582 |
+
__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObjectAsync(GLuint buffer, CUstream hStream);
|
| 583 |
+
|
| 584 |
+
/** @} */ /* END CUDA_GL_DEPRECATED */
|
| 585 |
+
/** @} */ /* END CUDA_GL */
|
| 586 |
+
|
| 587 |
+
|
| 588 |
+
#if defined(__CUDA_API_VERSION_INTERNAL)
|
| 589 |
+
#undef cuGLCtxCreate
|
| 590 |
+
#undef cuGLMapBufferObject
|
| 591 |
+
#undef cuGLMapBufferObjectAsync
|
| 592 |
+
#undef cuGLGetDevices
|
| 593 |
+
|
| 594 |
+
CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
|
| 595 |
+
CUresult CUDAAPI cuGLMapBufferObject_v2(CUdeviceptr *dptr, size_t *size, GLuint buffer);
|
| 596 |
+
CUresult CUDAAPI cuGLMapBufferObjectAsync_v2(CUdeviceptr *dptr, size_t *size, GLuint buffer, CUstream hStream);
|
| 597 |
+
CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
|
| 598 |
+
CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer);
|
| 599 |
+
CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream);
|
| 600 |
+
#endif /* __CUDA_API_VERSION_INTERNAL */
|
| 601 |
+
|
| 602 |
+
#ifdef __cplusplus
|
| 603 |
+
};
|
| 604 |
+
#endif
|
| 605 |
+
|
| 606 |
+
#undef __CUDA_DEPRECATED
|
| 607 |
+
|
| 608 |
+
#endif
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaGLTypedefs.h
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef CUDAGLTYPEDEFS_H
|
| 51 |
+
#define CUDAGLTYPEDEFS_H
|
| 52 |
+
|
| 53 |
+
// Dependent includes for cudagl.h
|
| 54 |
+
#include <GL/gl.h>
|
| 55 |
+
|
| 56 |
+
#include <cudaGL.h>
|
| 57 |
+
|
| 58 |
+
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
|
| 59 |
+
#define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
|
| 60 |
+
#define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
|
| 61 |
+
#else
|
| 62 |
+
#define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
|
| 63 |
+
#define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
|
| 64 |
+
#endif
|
| 65 |
+
|
| 66 |
+
#ifdef __cplusplus
|
| 67 |
+
extern "C" {
|
| 68 |
+
#endif // __cplusplus
|
| 69 |
+
|
| 70 |
+
/*
|
| 71 |
+
* Macros for the latest version for each driver function in cudaGL.h
|
| 72 |
+
*/
|
| 73 |
+
#define PFN_cuGraphicsGLRegisterBuffer PFN_cuGraphicsGLRegisterBuffer_v3000
|
| 74 |
+
#define PFN_cuGraphicsGLRegisterImage PFN_cuGraphicsGLRegisterImage_v3000
|
| 75 |
+
#define PFN_cuWGLGetDevice PFN_cuWGLGetDevice_v2020
|
| 76 |
+
#define PFN_cuGLGetDevices PFN_cuGLGetDevices_v6050
|
| 77 |
+
#define PFN_cuGLCtxCreate PFN_cuGLCtxCreate_v3020
|
| 78 |
+
#define PFN_cuGLInit PFN_cuGLInit_v2000
|
| 79 |
+
#define PFN_cuGLRegisterBufferObject PFN_cuGLRegisterBufferObject_v2000
|
| 80 |
+
#define PFN_cuGLMapBufferObject __API_TYPEDEF_PTDS(PFN_cuGLMapBufferObject, 3020, 7000)
|
| 81 |
+
#define PFN_cuGLUnmapBufferObject PFN_cuGLUnmapBufferObject_v2000
|
| 82 |
+
#define PFN_cuGLUnregisterBufferObject PFN_cuGLUnregisterBufferObject_v2000
|
| 83 |
+
#define PFN_cuGLSetBufferObjectMapFlags PFN_cuGLSetBufferObjectMapFlags_v2030
|
| 84 |
+
#define PFN_cuGLMapBufferObjectAsync __API_TYPEDEF_PTSZ(PFN_cuGLMapBufferObjectAsync, 3020, 7000)
|
| 85 |
+
#define PFN_cuGLUnmapBufferObjectAsync PFN_cuGLUnmapBufferObjectAsync_v2030
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
/**
|
| 89 |
+
* Type definitions for functions defined in cudaGL.h
|
| 90 |
+
*/
|
| 91 |
+
typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterBuffer_v3000)(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
|
| 92 |
+
typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterImage_v3000)(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
|
| 93 |
+
#ifdef _WIN32
|
| 94 |
+
typedef CUresult (CUDAAPI *PFN_cuWGLGetDevice_v2020)(CUdevice_v1 *pDevice, HGPUNV hGpu);
|
| 95 |
+
#endif
|
| 96 |
+
typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v6050)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
|
| 97 |
+
typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v3020)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
|
| 98 |
+
typedef CUresult (CUDAAPI *PFN_cuGLInit_v2000)(void);
|
| 99 |
+
typedef CUresult (CUDAAPI *PFN_cuGLRegisterBufferObject_v2000)(GLuint buffer);
|
| 100 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v7000_ptds)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
|
| 101 |
+
typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObject_v2000)(GLuint buffer);
|
| 102 |
+
typedef CUresult (CUDAAPI *PFN_cuGLUnregisterBufferObject_v2000)(GLuint buffer);
|
| 103 |
+
typedef CUresult (CUDAAPI *PFN_cuGLSetBufferObjectMapFlags_v2030)(GLuint buffer, unsigned int Flags);
|
| 104 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v7000_ptsz)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
|
| 105 |
+
typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObjectAsync_v2030)(GLuint buffer, CUstream hStream);
|
| 106 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
|
| 107 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
|
| 108 |
+
|
| 109 |
+
/*
|
| 110 |
+
* Type definitions for older versioned functions in cuda.h
|
| 111 |
+
*/
|
| 112 |
+
#if defined(__CUDA_API_VERSION_INTERNAL)
|
| 113 |
+
typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v4010)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
|
| 114 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v2000)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer);
|
| 115 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v2030)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream);
|
| 116 |
+
typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v2000)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
|
| 117 |
+
#endif
|
| 118 |
+
|
| 119 |
+
#ifdef __cplusplus
|
| 120 |
+
}
|
| 121 |
+
#endif // __cplusplus
|
| 122 |
+
|
| 123 |
+
#endif // file guard
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaTypedefs.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_gl_interop.h
ADDED
|
@@ -0,0 +1,514 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_GL_INTEROP_H__)
|
| 51 |
+
#define __CUDA_GL_INTEROP_H__
|
| 52 |
+
|
| 53 |
+
#include "cuda_runtime_api.h"
|
| 54 |
+
|
| 55 |
+
#if defined(__APPLE__)
|
| 56 |
+
|
| 57 |
+
#include <OpenGL/gl.h>
|
| 58 |
+
|
| 59 |
+
#else /* __APPLE__ */
|
| 60 |
+
|
| 61 |
+
#if defined(__arm__) || defined(__aarch64__)
|
| 62 |
+
#ifndef GL_VERSION
|
| 63 |
+
#error Please include the appropriate gl headers before including cuda_gl_interop.h
|
| 64 |
+
#endif
|
| 65 |
+
#else
|
| 66 |
+
#include <GL/gl.h>
|
| 67 |
+
#endif
|
| 68 |
+
|
| 69 |
+
#endif /* __APPLE__ */
|
| 70 |
+
|
| 71 |
+
/** \cond impl_private */
|
| 72 |
+
#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
|
| 73 |
+
#define __CUDA_DEPRECATED
|
| 74 |
+
#elif defined(_MSC_VER)
|
| 75 |
+
#define __CUDA_DEPRECATED __declspec(deprecated)
|
| 76 |
+
#elif defined(__GNUC__)
|
| 77 |
+
#define __CUDA_DEPRECATED __attribute__((deprecated))
|
| 78 |
+
#else
|
| 79 |
+
#define __CUDA_DEPRECATED
|
| 80 |
+
#endif
|
| 81 |
+
/** \endcond impl_private */
|
| 82 |
+
|
| 83 |
+
#if defined(__cplusplus)
|
| 84 |
+
extern "C" {
|
| 85 |
+
#endif /* __cplusplus */
|
| 86 |
+
|
| 87 |
+
/**
|
| 88 |
+
* \addtogroup CUDART_OPENGL OpenGL Interoperability
|
| 89 |
+
* This section describes the OpenGL interoperability functions of the CUDA
|
| 90 |
+
* runtime application programming interface. Note that mapping of OpenGL
|
| 91 |
+
* resources is performed with the graphics API agnostic, resource mapping
|
| 92 |
+
* interface described in \ref CUDART_INTEROP "Graphics Interopability".
|
| 93 |
+
*
|
| 94 |
+
* @{
|
| 95 |
+
*/
|
| 96 |
+
|
| 97 |
+
/**
|
| 98 |
+
* CUDA devices corresponding to the current OpenGL context
|
| 99 |
+
*/
|
| 100 |
+
enum cudaGLDeviceList
|
| 101 |
+
{
|
| 102 |
+
cudaGLDeviceListAll = 1, /**< The CUDA devices for all GPUs used by the current OpenGL context */
|
| 103 |
+
cudaGLDeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
|
| 104 |
+
cudaGLDeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
|
| 105 |
+
};
|
| 106 |
+
|
| 107 |
+
/**
|
| 108 |
+
* \brief Gets the CUDA devices associated with the current OpenGL context
|
| 109 |
+
*
|
| 110 |
+
* Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices
|
| 111 |
+
* corresponding to the current OpenGL context. Also returns in \p *pCudaDevices
|
| 112 |
+
* at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to
|
| 113 |
+
* the current OpenGL context. If any of the GPUs being used by the current OpenGL
|
| 114 |
+
* context are not CUDA capable then the call will return ::cudaErrorNoDevice.
|
| 115 |
+
*
|
| 116 |
+
* \param pCudaDeviceCount - Returned number of CUDA devices corresponding to the
|
| 117 |
+
* current OpenGL context
|
| 118 |
+
* \param pCudaDevices - Returned CUDA devices corresponding to the current
|
| 119 |
+
* OpenGL context
|
| 120 |
+
* \param cudaDeviceCount - The size of the output device array \p pCudaDevices
|
| 121 |
+
* \param deviceList - The set of devices to return. This set may be
|
| 122 |
+
* ::cudaGLDeviceListAll for all devices,
|
| 123 |
+
* ::cudaGLDeviceListCurrentFrame for the devices used to
|
| 124 |
+
* render the current frame (in SLI), or
|
| 125 |
+
* ::cudaGLDeviceListNextFrame for the devices used to
|
| 126 |
+
* render the next frame (in SLI).
|
| 127 |
+
*
|
| 128 |
+
* \return
|
| 129 |
+
* ::cudaSuccess,
|
| 130 |
+
* ::cudaErrorNoDevice,
|
| 131 |
+
* ::cudaErrorInvalidGraphicsContext,
|
| 132 |
+
* ::cudaErrorOperatingSystem,
|
| 133 |
+
* ::cudaErrorUnknown
|
| 134 |
+
*
|
| 135 |
+
* \note This function is not supported on Mac OS X.
|
| 136 |
+
* \notefnerr
|
| 137 |
+
*
|
| 138 |
+
* \sa
|
| 139 |
+
* ::cudaGraphicsUnregisterResource,
|
| 140 |
+
* ::cudaGraphicsMapResources,
|
| 141 |
+
* ::cudaGraphicsSubResourceGetMappedArray,
|
| 142 |
+
* ::cudaGraphicsResourceGetMappedPointer,
|
| 143 |
+
* ::cuGLGetDevices
|
| 144 |
+
*/
|
| 145 |
+
extern __host__ cudaError_t CUDARTAPI cudaGLGetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, enum cudaGLDeviceList deviceList);
|
| 146 |
+
|
| 147 |
+
/**
|
| 148 |
+
* \brief Register an OpenGL texture or renderbuffer object
|
| 149 |
+
*
|
| 150 |
+
* Registers the texture or renderbuffer object specified by \p image for access by CUDA.
|
| 151 |
+
* A handle to the registered object is returned as \p resource.
|
| 152 |
+
*
|
| 153 |
+
* \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D,
|
| 154 |
+
* ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY,
|
| 155 |
+
* or ::GL_RENDERBUFFER.
|
| 156 |
+
*
|
| 157 |
+
* The register flags \p flags specify the intended usage, as follows:
|
| 158 |
+
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
|
| 159 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 160 |
+
* read from and written to by CUDA. This is the default value.
|
| 161 |
+
* - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
|
| 162 |
+
* will not write to this resource.
|
| 163 |
+
* - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
|
| 164 |
+
* CUDA will not read from this resource and will write over the
|
| 165 |
+
* entire contents of the resource, so none of the data previously
|
| 166 |
+
* stored in the resource will be preserved.
|
| 167 |
+
* - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
|
| 168 |
+
* bind this resource to a surface reference.
|
| 169 |
+
* - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
|
| 170 |
+
* texture gather operations on this resource.
|
| 171 |
+
*
|
| 172 |
+
* The following image formats are supported. For brevity's sake, the list is abbreviated.
|
| 173 |
+
* For ex., {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats
|
| 174 |
+
* {GL_R8, GL_R16, GL_RG8, GL_RG16} :
|
| 175 |
+
* - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
|
| 176 |
+
* - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
|
| 177 |
+
* - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
|
| 178 |
+
* {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
|
| 179 |
+
*
|
| 180 |
+
* The following image classes are currently disallowed:
|
| 181 |
+
* - Textures with borders
|
| 182 |
+
* - Multisampled renderbuffers
|
| 183 |
+
*
|
| 184 |
+
* \param resource - Pointer to the returned object handle
|
| 185 |
+
* \param image - name of texture or renderbuffer object to be registered
|
| 186 |
+
* \param target - Identifies the type of object specified by \p image
|
| 187 |
+
* \param flags - Register flags
|
| 188 |
+
*
|
| 189 |
+
* \return
|
| 190 |
+
* ::cudaSuccess,
|
| 191 |
+
* ::cudaErrorInvalidDevice,
|
| 192 |
+
* ::cudaErrorInvalidValue,
|
| 193 |
+
* ::cudaErrorInvalidResourceHandle,
|
| 194 |
+
* ::cudaErrorOperatingSystem,
|
| 195 |
+
* ::cudaErrorUnknown
|
| 196 |
+
* \notefnerr
|
| 197 |
+
*
|
| 198 |
+
* \sa
|
| 199 |
+
* ::cudaGraphicsUnregisterResource,
|
| 200 |
+
* ::cudaGraphicsMapResources,
|
| 201 |
+
* ::cudaGraphicsSubResourceGetMappedArray,
|
| 202 |
+
* ::cuGraphicsGLRegisterImage
|
| 203 |
+
*/
|
| 204 |
+
extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterImage(struct cudaGraphicsResource **resource, GLuint image, GLenum target, unsigned int flags);
|
| 205 |
+
|
| 206 |
+
/**
|
| 207 |
+
* \brief Registers an OpenGL buffer object
|
| 208 |
+
*
|
| 209 |
+
* Registers the buffer object specified by \p buffer for access by
|
| 210 |
+
* CUDA. A handle to the registered object is returned as \p
|
| 211 |
+
* resource. The register flags \p flags specify the intended usage,
|
| 212 |
+
* as follows:
|
| 213 |
+
*
|
| 214 |
+
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
|
| 215 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 216 |
+
* read from and written to by CUDA. This is the default value.
|
| 217 |
+
* - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
|
| 218 |
+
* will not write to this resource.
|
| 219 |
+
* - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
|
| 220 |
+
* CUDA will not read from this resource and will write over the
|
| 221 |
+
* entire contents of the resource, so none of the data previously
|
| 222 |
+
* stored in the resource will be preserved.
|
| 223 |
+
*
|
| 224 |
+
* \param resource - Pointer to the returned object handle
|
| 225 |
+
* \param buffer - name of buffer object to be registered
|
| 226 |
+
* \param flags - Register flags
|
| 227 |
+
*
|
| 228 |
+
* \return
|
| 229 |
+
* ::cudaSuccess,
|
| 230 |
+
* ::cudaErrorInvalidDevice,
|
| 231 |
+
* ::cudaErrorInvalidValue,
|
| 232 |
+
* ::cudaErrorInvalidResourceHandle,
|
| 233 |
+
* ::cudaErrorOperatingSystem,
|
| 234 |
+
* ::cudaErrorUnknown
|
| 235 |
+
* \notefnerr
|
| 236 |
+
*
|
| 237 |
+
* \sa
|
| 238 |
+
* ::cudaGraphicsUnregisterResource,
|
| 239 |
+
* ::cudaGraphicsMapResources,
|
| 240 |
+
* ::cudaGraphicsResourceGetMappedPointer,
|
| 241 |
+
* ::cuGraphicsGLRegisterBuffer
|
| 242 |
+
*/
|
| 243 |
+
extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterBuffer(struct cudaGraphicsResource **resource, GLuint buffer, unsigned int flags);
|
| 244 |
+
|
| 245 |
+
#ifdef _WIN32
|
| 246 |
+
#ifndef WGL_NV_gpu_affinity
|
| 247 |
+
typedef void* HGPUNV;
|
| 248 |
+
#endif
|
| 249 |
+
|
| 250 |
+
/**
|
| 251 |
+
* \brief Gets the CUDA device associated with hGpu
|
| 252 |
+
*
|
| 253 |
+
* Returns the CUDA device associated with a hGpu, if applicable.
|
| 254 |
+
*
|
| 255 |
+
* \param device - Returns the device associated with hGpu, or -1 if hGpu is
|
| 256 |
+
* not a compute device.
|
| 257 |
+
* \param hGpu - Handle to a GPU, as queried via WGL_NV_gpu_affinity
|
| 258 |
+
*
|
| 259 |
+
* \return
|
| 260 |
+
* ::cudaSuccess
|
| 261 |
+
* \notefnerr
|
| 262 |
+
*
|
| 263 |
+
* \sa
|
| 264 |
+
* ::WGL_NV_gpu_affinity,
|
| 265 |
+
* ::cuWGLGetDevice
|
| 266 |
+
*/
|
| 267 |
+
extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu);
|
| 268 |
+
#endif
|
| 269 |
+
|
| 270 |
+
/** @} */ /* END CUDART_OPENGL */
|
| 271 |
+
|
| 272 |
+
/**
|
| 273 |
+
* \addtogroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED]
|
| 274 |
+
* This section describes deprecated OpenGL interoperability functionality.
|
| 275 |
+
*
|
| 276 |
+
* @{
|
| 277 |
+
*/
|
| 278 |
+
|
| 279 |
+
/**
|
| 280 |
+
* CUDA GL Map Flags
|
| 281 |
+
*/
|
| 282 |
+
enum cudaGLMapFlags
|
| 283 |
+
{
|
| 284 |
+
cudaGLMapFlagsNone = 0, /**< Default; Assume resource can be read/written */
|
| 285 |
+
cudaGLMapFlagsReadOnly = 1, /**< CUDA kernels will not write to this resource */
|
| 286 |
+
cudaGLMapFlagsWriteDiscard = 2 /**< CUDA kernels will only write to and will not read from this resource */
|
| 287 |
+
};
|
| 288 |
+
|
| 289 |
+
/**
|
| 290 |
+
* \brief Sets a CUDA device to use OpenGL interoperability
|
| 291 |
+
*
|
| 292 |
+
* \deprecated This function is deprecated as of CUDA 5.0.
|
| 293 |
+
*
|
| 294 |
+
* This function is deprecated and should no longer be used. It is
|
| 295 |
+
* no longer necessary to associate a CUDA device with an OpenGL
|
| 296 |
+
* context in order to achieve maximum interoperability performance.
|
| 297 |
+
*
|
| 298 |
+
* This function will immediately initialize the primary context on
|
| 299 |
+
* \p device if needed.
|
| 300 |
+
*
|
| 301 |
+
* \param device - Device to use for OpenGL interoperability
|
| 302 |
+
*
|
| 303 |
+
* \return
|
| 304 |
+
* ::cudaSuccess,
|
| 305 |
+
* ::cudaErrorInvalidDevice,
|
| 306 |
+
* ::cudaErrorSetOnActiveProcess
|
| 307 |
+
* \notefnerr
|
| 308 |
+
*
|
| 309 |
+
* \sa ::cudaGraphicsGLRegisterBuffer, ::cudaGraphicsGLRegisterImage
|
| 310 |
+
*/
|
| 311 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
|
| 312 |
+
|
| 313 |
+
/**
|
| 314 |
+
* \brief Registers a buffer object for access by CUDA
|
| 315 |
+
*
|
| 316 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 317 |
+
*
|
| 318 |
+
* Registers the buffer object of ID \p bufObj for access by
|
| 319 |
+
* CUDA. This function must be called before CUDA can map the buffer
|
| 320 |
+
* object. The OpenGL context used to create the buffer, or another
|
| 321 |
+
* context from the same share group, must be bound to the current
|
| 322 |
+
* thread when this is called.
|
| 323 |
+
*
|
| 324 |
+
* \param bufObj - Buffer object ID to register
|
| 325 |
+
*
|
| 326 |
+
* \return
|
| 327 |
+
* ::cudaSuccess,
|
| 328 |
+
* ::cudaErrorInitializationError
|
| 329 |
+
* \notefnerr
|
| 330 |
+
*
|
| 331 |
+
* \sa ::cudaGraphicsGLRegisterBuffer
|
| 332 |
+
*/
|
| 333 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint bufObj);
|
| 334 |
+
|
| 335 |
+
/**
|
| 336 |
+
* \brief Maps a buffer object for access by CUDA
|
| 337 |
+
*
|
| 338 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 339 |
+
*
|
| 340 |
+
* Maps the buffer object of ID \p bufObj into the address space of
|
| 341 |
+
* CUDA and returns in \p *devPtr the base pointer of the resulting
|
| 342 |
+
* mapping. The buffer must have previously been registered by
|
| 343 |
+
* calling ::cudaGLRegisterBufferObject(). While a buffer is mapped
|
| 344 |
+
* by CUDA, any OpenGL operation which references the buffer will
|
| 345 |
+
* result in undefined behavior. The OpenGL context used to create
|
| 346 |
+
* the buffer, or another context from the same share group, must be
|
| 347 |
+
* bound to the current thread when this is called.
|
| 348 |
+
*
|
| 349 |
+
* All streams in the current thread are synchronized with the current
|
| 350 |
+
* GL context.
|
| 351 |
+
*
|
| 352 |
+
* \param devPtr - Returned device pointer to CUDA object
|
| 353 |
+
* \param bufObj - Buffer object ID to map
|
| 354 |
+
*
|
| 355 |
+
* \return
|
| 356 |
+
* ::cudaSuccess,
|
| 357 |
+
* ::cudaErrorMapBufferObjectFailed
|
| 358 |
+
* \notefnerr
|
| 359 |
+
*
|
| 360 |
+
* \sa ::cudaGraphicsMapResources
|
| 361 |
+
*/
|
| 362 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj);
|
| 363 |
+
|
| 364 |
+
/**
|
| 365 |
+
* \brief Unmaps a buffer object for access by CUDA
|
| 366 |
+
*
|
| 367 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 368 |
+
*
|
| 369 |
+
* Unmaps the buffer object of ID \p bufObj for access by CUDA. When
|
| 370 |
+
* a buffer is unmapped, the base address returned by
|
| 371 |
+
* ::cudaGLMapBufferObject() is invalid and subsequent references to
|
| 372 |
+
* the address result in undefined behavior. The OpenGL context used
|
| 373 |
+
* to create the buffer, or another context from the same share group,
|
| 374 |
+
* must be bound to the current thread when this is called.
|
| 375 |
+
*
|
| 376 |
+
* All streams in the current thread are synchronized with the current
|
| 377 |
+
* GL context.
|
| 378 |
+
*
|
| 379 |
+
* \param bufObj - Buffer object to unmap
|
| 380 |
+
*
|
| 381 |
+
* \return
|
| 382 |
+
* ::cudaSuccess,
|
| 383 |
+
* ::cudaErrorUnmapBufferObjectFailed
|
| 384 |
+
* \notefnerr
|
| 385 |
+
*
|
| 386 |
+
* \sa ::cudaGraphicsUnmapResources
|
| 387 |
+
*/
|
| 388 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj);
|
| 389 |
+
|
| 390 |
+
/**
|
| 391 |
+
* \brief Unregisters a buffer object for access by CUDA
|
| 392 |
+
*
|
| 393 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 394 |
+
*
|
| 395 |
+
* Unregisters the buffer object of ID \p bufObj for access by CUDA
|
| 396 |
+
* and releases any CUDA resources associated with the buffer. Once a
|
| 397 |
+
* buffer is unregistered, it may no longer be mapped by CUDA. The GL
|
| 398 |
+
* context used to create the buffer, or another context from the
|
| 399 |
+
* same share group, must be bound to the current thread when this is
|
| 400 |
+
* called.
|
| 401 |
+
*
|
| 402 |
+
* \param bufObj - Buffer object to unregister
|
| 403 |
+
*
|
| 404 |
+
* \return
|
| 405 |
+
* ::cudaSuccess
|
| 406 |
+
* \notefnerr
|
| 407 |
+
*
|
| 408 |
+
* \sa ::cudaGraphicsUnregisterResource
|
| 409 |
+
*/
|
| 410 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint bufObj);
|
| 411 |
+
|
| 412 |
+
/**
|
| 413 |
+
* \brief Set usage flags for mapping an OpenGL buffer
|
| 414 |
+
*
|
| 415 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 416 |
+
*
|
| 417 |
+
* Set flags for mapping the OpenGL buffer \p bufObj
|
| 418 |
+
*
|
| 419 |
+
* Changes to flags will take effect the next time \p bufObj is mapped.
|
| 420 |
+
* The \p flags argument may be any of the following:
|
| 421 |
+
*
|
| 422 |
+
* - ::cudaGLMapFlagsNone: Specifies no hints about how this buffer will
|
| 423 |
+
* be used. It is therefore assumed that this buffer will be read from and
|
| 424 |
+
* written to by CUDA kernels. This is the default value.
|
| 425 |
+
* - ::cudaGLMapFlagsReadOnly: Specifies that CUDA kernels which access this
|
| 426 |
+
* buffer will not write to the buffer.
|
| 427 |
+
* - ::cudaGLMapFlagsWriteDiscard: Specifies that CUDA kernels which access
|
| 428 |
+
* this buffer will not read from the buffer and will write over the
|
| 429 |
+
* entire contents of the buffer, so none of the data previously stored in
|
| 430 |
+
* the buffer will be preserved.
|
| 431 |
+
*
|
| 432 |
+
* If \p bufObj has not been registered for use with CUDA, then
|
| 433 |
+
* ::cudaErrorInvalidResourceHandle is returned. If \p bufObj is presently
|
| 434 |
+
* mapped for access by CUDA, then ::cudaErrorUnknown is returned.
|
| 435 |
+
*
|
| 436 |
+
* \param bufObj - Registered buffer object to set flags for
|
| 437 |
+
* \param flags - Parameters for buffer mapping
|
| 438 |
+
*
|
| 439 |
+
* \return
|
| 440 |
+
* ::cudaSuccess,
|
| 441 |
+
* ::cudaErrorInvalidValue,
|
| 442 |
+
* ::cudaErrorInvalidResourceHandle,
|
| 443 |
+
* ::cudaErrorUnknown
|
| 444 |
+
* \notefnerr
|
| 445 |
+
*
|
| 446 |
+
* \sa ::cudaGraphicsResourceSetMapFlags
|
| 447 |
+
*/
|
| 448 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint bufObj, unsigned int flags);
|
| 449 |
+
|
| 450 |
+
/**
|
| 451 |
+
* \brief Maps a buffer object for access by CUDA
|
| 452 |
+
*
|
| 453 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 454 |
+
*
|
| 455 |
+
* Maps the buffer object of ID \p bufObj into the address space of
|
| 456 |
+
* CUDA and returns in \p *devPtr the base pointer of the resulting
|
| 457 |
+
* mapping. The buffer must have previously been registered by
|
| 458 |
+
* calling ::cudaGLRegisterBufferObject(). While a buffer is mapped
|
| 459 |
+
* by CUDA, any OpenGL operation which references the buffer will
|
| 460 |
+
* result in undefined behavior. The OpenGL context used to create
|
| 461 |
+
* the buffer, or another context from the same share group, must be
|
| 462 |
+
* bound to the current thread when this is called.
|
| 463 |
+
*
|
| 464 |
+
* Stream /p stream is synchronized with the current GL context.
|
| 465 |
+
*
|
| 466 |
+
* \param devPtr - Returned device pointer to CUDA object
|
| 467 |
+
* \param bufObj - Buffer object ID to map
|
| 468 |
+
* \param stream - Stream to synchronize
|
| 469 |
+
*
|
| 470 |
+
* \return
|
| 471 |
+
* ::cudaSuccess,
|
| 472 |
+
* ::cudaErrorMapBufferObjectFailed
|
| 473 |
+
* \notefnerr
|
| 474 |
+
*
|
| 475 |
+
* \sa ::cudaGraphicsMapResources
|
| 476 |
+
*/
|
| 477 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **devPtr, GLuint bufObj, cudaStream_t stream);
|
| 478 |
+
|
| 479 |
+
/**
|
| 480 |
+
* \brief Unmaps a buffer object for access by CUDA
|
| 481 |
+
*
|
| 482 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 483 |
+
*
|
| 484 |
+
* Unmaps the buffer object of ID \p bufObj for access by CUDA. When
|
| 485 |
+
* a buffer is unmapped, the base address returned by
|
| 486 |
+
* ::cudaGLMapBufferObject() is invalid and subsequent references to
|
| 487 |
+
* the address result in undefined behavior. The OpenGL context used
|
| 488 |
+
* to create the buffer, or another context from the same share group,
|
| 489 |
+
* must be bound to the current thread when this is called.
|
| 490 |
+
*
|
| 491 |
+
* Stream /p stream is synchronized with the current GL context.
|
| 492 |
+
*
|
| 493 |
+
* \param bufObj - Buffer object to unmap
|
| 494 |
+
* \param stream - Stream to synchronize
|
| 495 |
+
*
|
| 496 |
+
* \return
|
| 497 |
+
* ::cudaSuccess,
|
| 498 |
+
* ::cudaErrorUnmapBufferObjectFailed
|
| 499 |
+
* \notefnerr
|
| 500 |
+
*
|
| 501 |
+
* \sa ::cudaGraphicsUnmapResources
|
| 502 |
+
*/
|
| 503 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint bufObj, cudaStream_t stream);
|
| 504 |
+
|
| 505 |
+
/** @} */ /* END CUDART_OPENGL_DEPRECATED */
|
| 506 |
+
|
| 507 |
+
#if defined(__cplusplus)
|
| 508 |
+
}
|
| 509 |
+
#endif /* __cplusplus */
|
| 510 |
+
|
| 511 |
+
#undef __CUDA_DEPRECATED
|
| 512 |
+
|
| 513 |
+
#endif /* __CUDA_GL_INTEROP_H__ */
|
| 514 |
+
|
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_runtime_api.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|