koichi12 commited on
Commit
f64ba55
·
verified ·
1 Parent(s): a8c7c5f

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/triton/backends/amd/compiler.py +262 -0
  2. .venv/lib/python3.11/site-packages/triton/backends/amd/driver.c +211 -0
  3. .venv/lib/python3.11/site-packages/triton/backends/amd/driver.py +497 -0
  4. .venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/hip_runtime.h +75 -0
  5. .venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/texture_types.h +194 -0
  6. .venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/__init__.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/compiler.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/driver.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openacc/cupti_openacc.h +98 -0
  10. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/cupti_openmp.h +100 -0
  11. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/omp-tools.h +1083 -0
  12. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/channel_descriptor.h +588 -0
  13. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups.h +1730 -0
  14. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/async.h +452 -0
  15. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/coalesced_reduce.h +95 -0
  16. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/functional.h +212 -0
  17. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/helpers.h +693 -0
  18. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/memory.h +135 -0
  19. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/reduce.h +419 -0
  20. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/scan.h +320 -0
  21. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/memcpy_async.h +62 -0
  22. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/reduce.h +63 -0
  23. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/scan.h +63 -0
  24. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/common_functions.h +310 -0
  25. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/cudacc_ext.h +64 -0
  26. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_double_functions.h +1192 -0
  27. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_double_functions.hpp +197 -0
  28. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_functions.h +0 -0
  29. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_functions.hpp +1197 -0
  30. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/func_macro.h +57 -0
  31. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_config.h +310 -0
  32. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_defines.h +280 -0
  33. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_runtime.h +306 -0
  34. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/math_functions.h +0 -0
  35. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/math_functions.hpp +0 -0
  36. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/mma.h +754 -0
  37. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/mma.hpp +1128 -0
  38. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/nvfunctional +621 -0
  39. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.h +139 -0
  40. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.hpp +192 -0
  41. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.h +164 -0
  42. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.hpp +148 -0
  43. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.h +282 -0
  44. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.hpp +248 -0
  45. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/storage_class.h +142 -0
  46. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaGL.h +608 -0
  47. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaGLTypedefs.h +123 -0
  48. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaTypedefs.h +0 -0
  49. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_gl_interop.h +514 -0
  50. .venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_runtime_api.h +0 -0
.venv/lib/python3.11/site-packages/triton/backends/amd/compiler.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from triton.backends.compiler import BaseBackend, GPUTarget
2
+ from triton._C.libtriton import ir, passes, llvm, amd
3
+ from dataclasses import dataclass
4
+ from typing import Any, Tuple
5
+ import hashlib
6
+ import tempfile
7
+ import os
8
+ import re
9
+ import subprocess
10
+ import functools
11
+ from pathlib import Path
12
+
13
+
14
@dataclass(frozen=True)
class HIPOptions:
    """Compilation options for the Triton HIP (AMD GPU) backend.

    Frozen dataclass: derived fields (``warp_size``, normalized
    ``extern_libs``) are installed in ``__post_init__`` via
    ``object.__setattr__`` because the instance is immutable.
    """
    num_warps: int = 4
    waves_per_eu: int = 1
    num_stages: int = 0          # 0 selects automatic stream pipelining (see make_ttgir)
    num_ctas: int = 1
    extern_libs: dict = None     # name -> bitcode path; normalized to a tuple of pairs below
    cluster_dims: tuple = (1, 1, 1)
    debug: bool = False
    arch: str = None             # gfx arch string (e.g. 'gfx90a'); filled in by HIPBackend.parse_options
    allow_fp8e4nv: bool = False
    allow_fp8e4b15: bool = False
    default_dot_input_precision: str = "ieee"
    allowed_dot_input_precisions: Tuple[str, ...] = ("ieee", )
    enable_fp_fusion: bool = True
    matrix_instr_nonkdim: int = 0
    kpack: int = 1
    allow_flush_denorm: bool = False
    max_num_imprecise_acc_default: int = 0
    backend_name: str = 'hip'

    def __post_init__(self):
        default_libdir = Path(__file__).parent / 'lib'
        extern_libs = {} if self.extern_libs is None else dict(self.extern_libs)
        # Warp size is dictated by the architecture, not the user:
        # RDNA (gfx10/gfx11) uses wave32, everything else wave64.
        warp_size = 32 if 'gfx10' in self.arch or 'gfx11' in self.arch else 64
        object.__setattr__(self, 'warp_size', warp_size)
        # Always link the AMD device libraries (math + kernel support).
        libs = ["ocml", "ockl"]
        for lib in libs:
            extern_libs[lib] = str(default_libdir / f'{lib}.bc')
        object.__setattr__(self, 'extern_libs', tuple(extern_libs.items()))
        assert self.num_warps > 0 and (self.num_warps & (self.num_warps - 1)) == 0, \
               "num_warps must be a power of 2"

    def hash(self):
        # Stable digest of all option fields; used as part of the compilation cache key.
        key = '_'.join([f'{name}-{val}' for name, val in self.__dict__.items()])
        return hashlib.sha256(key.encode("utf-8")).hexdigest()
51
+
52
+
53
class HIPBackend(BaseBackend):
    """Triton compilation backend for AMD GPUs via HIP.

    Lowering pipeline (see add_stages): Triton IR -> TritonGPU IR ->
    LLVM IR (text) -> AMDGCN assembly -> 'hsaco' shared object that the
    HIP driver can load.
    """

    @staticmethod
    def supports_target(target: GPUTarget):
        return target.backend == 'hip'

    def __init__(self, target: GPUTarget) -> None:
        super().__init__(target)
        # target.arch must be a gfx arch string, e.g. 'gfx90a'.
        assert isinstance(target.arch, str)
        self.binary_ext = "hsaco"

    def parse_options(self, opts) -> Any:
        """Build HIPOptions from a user dict, keeping only recognized keys."""
        args = {'arch': self.target.arch}
        args.update({k: opts[k] for k in HIPOptions.__dataclass_fields__.keys() if k in opts})
        return HIPOptions(**args)

    def pack_metadata(self, metadata):
        # Fixed-order tuple consumed by the runtime launcher.
        return (
            metadata.num_warps,
            metadata.num_ctas,
            metadata.shared,
            metadata.cluster_dims[0],
            metadata.cluster_dims[1],
            metadata.cluster_dims[2],
        )

    def get_codegen_implementation(self):
        # No backend-specific codegen overrides for HIP.
        codegen_fns = dict()
        return codegen_fns

    def load_dialects(self, ctx):
        amd.load_dialects(ctx)

    @staticmethod
    def path_to_rocm_lld():
        """Locate the LLVM linker (ld.lld) used to produce the final hsaco.

        Search order:
          1. the TRITON_HIP_LLD_PATH environment variable,
          2. the copy bundled with this backend (used for PyTorch wheels),
          3. the default ROCm install location,
          4. the system linker.

        Raises if none of the candidates exists.
        """
        candidates = []
        lld_env_path = os.getenv("TRITON_HIP_LLD_PATH")
        if lld_env_path is not None:
            candidates.append(Path(lld_env_path))
        candidates.append(Path(__file__).parent / "llvm/bin/ld.lld")
        candidates.append(Path("/opt/rocm/llvm/bin/ld.lld"))
        candidates.append(Path("/usr/bin/ld.lld"))
        for lld in candidates:
            if lld.is_file():
                return lld
        # Report every location that was actually searched; the previous
        # message mentioned only the default ROCm path, which was misleading.
        searched = ', '.join(str(p) for p in candidates)
        raise Exception(f"ROCm linker ld.lld not found; searched: {searched}")

    @staticmethod
    def make_ttir(mod, metadata, options):
        """Run target-independent Triton IR cleanup/canonicalization passes."""
        pm = ir.pass_manager(mod.context)
        pm.enable_debug()
        passes.common.add_inliner(pm)
        passes.ttir.add_rewrite_tensor_pointer(pm)
        passes.ttir.add_combine(pm)
        passes.common.add_canonicalizer(pm)
        passes.ttir.add_reorder_broadcast(pm)
        passes.common.add_cse(pm)
        passes.common.add_licm(pm)
        passes.common.add_symbol_dce(pm)
        pm.run(mod)
        return mod

    @staticmethod
    def make_ttgir(mod, metadata, options):
        """Lower Triton IR to TritonGPU IR and run AMD-specific optimizations."""
        pm = ir.pass_manager(mod.context)
        pm.enable_debug()
        passes.ttir.add_convert_to_ttgpuir(pm, f"hip:{options.arch}", options.num_warps, options.warp_size,
                                           options.num_ctas)
        pm.run(mod)
        pm = ir.pass_manager(mod.context)
        pm.enable_debug()
        passes.ttgpuir.add_coalesce(pm)
        passes.ttgpuir.add_remove_layout_conversions(pm)
        passes.ttgpuir.add_optimize_thread_locality(pm)
        amd.passes.ttgpuir.add_accelerate_matmul(pm, options.arch, options.matrix_instr_nonkdim, options.kpack)
        passes.ttgpuir.add_remove_layout_conversions(pm)
        amd.passes.ttgpuir.add_optimize_epilogue(pm)
        passes.ttgpuir.add_optimize_dot_operands(pm, True)
        # Automatic stream pipelining only kicks in when the user did not
        # request explicit staging and the arch has matrix cores.
        if options.num_stages == 0 and amd.has_matrix_core_feature(options.arch):
            amd.passes.ttgpuir.add_stream_pipeline(pm)
            passes.common.add_canonicalizer(pm)
        passes.ttgpuir.add_optimize_dot_operands(pm, True)
        passes.ttgpuir.add_remove_layout_conversions(pm)
        passes.ttgpuir.add_reduce_data_duplication(pm)
        if options.num_stages != 0:
            amd.passes.ttgpuir.add_reorder_instructions(pm)
        passes.common.add_cse(pm)
        passes.common.add_symbol_dce(pm)
        pm.run(mod)
        return mod

    @staticmethod
    def make_llir(src, metadata, options):
        """Lower TritonGPU IR to optimized LLVM IR text and link device libs."""
        mod = src
        # TritonGPU -> LLVM-IR (MLIR)
        pm = ir.pass_manager(mod.context)
        pm.enable_debug()
        amd.passes.ttgpuir.add_decompose_unsupported_conversions(pm, options.arch)
        passes.convert.add_scf_to_cf(pm)
        passes.convert.add_index_to_llvmir(pm)

        passes.ttgpuir.add_allocate_shared_memory(pm)
        ## __HIP_FTZ is used to control the denorm flushing behavior of exp2 op as follows:
        ## 1. If __HIP_FTZ = 1, exp2 flushes denorms in input and output regardless
        ##    of the value of kernel arg `allow_flush_denorm`.
        ## 2. If __HIP_FTZ = 0, whether exp2 flushes denorms in input and output
        ##    depends on the value of kernel arg `allow_flush_denorm`.
        ## 3. __HIP_FTZ is default to 1 and not exposed as a kernel argument.
        ##    For now it is used as a controller for developers only.
        __HIP_FTZ = True
        amd.passes.ttgpuir.add_to_llvmir(pm, options.arch, __HIP_FTZ)
        passes.common.add_canonicalizer(pm)
        passes.common.add_cse(pm)

        passes.convert.add_cf_to_llvmir(pm)
        passes.convert.add_arith_to_llvmir(pm)
        passes.common.add_canonicalizer(pm)
        passes.common.add_cse(pm)
        passes.common.add_symbol_dce(pm)
        if os.environ.get("TRITON_DISABLE_LINE_INFO", "0") == "0":
            passes.llvmir.add_di_scope(pm)
        # This pass (`add_builtin_func_to_llvmir`) serves as a temporary workaround to address the issue of excessive basic block
        # count caused by predicated loads/stores. In certain kernels, the addition of these blocks can cause the MLIR
        # canonicalizer to never finish when attempting to merge blocks. The permanent solution under consideration
        # involves using MUBUF instructions that have built-in out-of-bounds checks, which would eliminate the need
        # for conditional branching around memory accesses.
        amd.passes.ttgpuir.add_builtin_func_to_llvmir(pm)
        pm.run(mod)

        # LLVM-IR (MLIR) -> LLVM-IR (LLVM)
        llvm.init_targets()
        context = llvm.context()
        llvm_mod = llvm.to_module(mod, context)

        # Set various control constants on the LLVM module so that device
        # libraries can resolve references to them.
        amd.set_isa_version(llvm_mod, options.arch)
        amd.set_abi_version(llvm_mod, 400)
        amd.set_bool_control_constant(llvm_mod, "__oclc_finite_only_opt", False)
        amd.set_bool_control_constant(llvm_mod, "__oclc_correctly_rounded_sqrt32", True)
        amd.set_bool_control_constant(llvm_mod, "__oclc_unsafe_math_opt", False)
        amd.set_bool_control_constant(llvm_mod, "__oclc_wavefrontsize64", options.warp_size == 64)

        # Set kernel attributes first given this may affect later optimizations.
        fns = [fn for fn in llvm_mod.get_functions() if not fn.is_declaration()]
        # The public kernel should be kernel 0.
        fns[0].set_calling_conv(amd.CALLING_CONV_AMDGPU_KERNEL)
        fns[0].add_fn_attr("amdgpu-flat-work-group-size", f"1,{options.num_warps*options.warp_size}")
        fns[0].add_fn_attr("amdgpu-waves-per-eu", f"{options.waves_per_eu}")
        denormal_mode = "preserve-sign" if options.allow_flush_denorm else "ieee"
        fns[0].add_fn_attr("denormal-fp-math-f32", denormal_mode)

        # Link only the device libraries the module actually references.
        if options.extern_libs:
            paths = [path for (name, path) in options.extern_libs if amd.need_extern_lib(llvm_mod, name)]
            llvm.link_extern_libs(llvm_mod, paths)

        llvm.optimize_module(llvm_mod, llvm.OPTIMIZE_O3, amd.TARGET_TRIPLE)

        # Record the shared-memory requirement for the launcher metadata.
        metadata["shared"] = src.get_int_attr("triton_gpu.shared")

        amd.cleanup_bitcode_metadata(llvm_mod)
        return str(llvm_mod)

    @staticmethod
    def make_amdgcn(src, metadata, options):
        """Translate LLVM IR text to AMDGCN assembly and record the kernel name."""
        # Find kernel names (there should only be one).
        # We get the name at the last possible step to accommodate `triton.compile`
        # on user-provided LLVM.
        names = re.findall(r"define amdgpu_kernel void @([a-zA-Z_][a-zA-Z0-9_]*)", src)
        assert len(names) == 1
        metadata["name"] = names[0]
        # llvm -> hsaco
        amdgcn = llvm.translate_to_asm(src, amd.TARGET_TRIPLE, options.arch, '', [], options.enable_fp_fusion, False)
        if os.environ.get("AMDGCN_ENABLE_DUMP", "0") == "1":
            print("// -----// AMDGCN Dump //----- //")
            print(amdgcn)
        return amdgcn

    @staticmethod
    def make_hsaco(src, metadata, options):
        """Assemble AMDGCN and link it into a loadable shared object (hsaco)."""
        hsaco = amd.assemble_amdgcn(src, options.arch, '')

        rocm_path = HIPBackend.path_to_rocm_lld()
        # ld.lld only works on files, so round-trip through temp files.
        with tempfile.NamedTemporaryFile() as tmp_out:
            with tempfile.NamedTemporaryFile() as tmp_in:
                with open(tmp_in.name, 'wb') as fd_in:
                    fd_in.write(hsaco)
                subprocess.check_call([rocm_path, '-flavor', 'gnu', '-shared', tmp_in.name, '-o', tmp_out.name])
            with open(tmp_out.name, 'rb') as fd_out:
                ret = fd_out.read()
        return ret

    def add_stages(self, stages, options):
        stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options)
        stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options)
        stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options)
        stages["amdgcn"] = lambda src, metadata: self.make_amdgcn(src, metadata, options)
        stages["hsaco"] = lambda src, metadata: self.make_hsaco(src, metadata, options)

    @functools.lru_cache()
    def hash(self):
        # Keyed on the linker version so cached binaries are invalidated
        # when the toolchain changes.
        version = subprocess.check_output([HIPBackend.path_to_rocm_lld(), "--version"], encoding='utf-8')
        return f'{version}-{self.target}'
.venv/lib/python3.11/site-packages/triton/backends/amd/driver.c ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#define __HIP_PLATFORM_AMD__
// clang-format off
// hip_deprecated.h needs definitions from hip_runtime.h.
#include <hip/hip_runtime.h>
#include <hip/hip_deprecated.h>
// clang-format on
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>

// The list of paths to search for the HIP runtime library. The caller Python
// code should substitute the search path placeholder.
static const char *hipLibSearchPaths[] = {"/*py_libhip_search_path*/"};

// The list of HIP dynamic library symbols and their signature we are interested
// in this file.
// |FOR_EACH_ERR_FN| is a macro to process APIs that return hipError_t;
// |FOR_EACH_STR_FN| is a macro to process APIs that return const char *.
//
// HIP 6.0 introduced an updated hipGetDeviceProperties API under a new symbol,
// hipGetDevicePropertiesR0600. However, the associated hipDeviceProp_t was
// directly updated with breaking changes to match hipGetDevicePropertiesR0600
// in the header file. We include the header file from HIP 6.0. So here if we
// use hipGetDeviceProperties together with hipDeviceProp_t we will use the
// old API with a new struct definition and mess up the interpretation.
//
// This is a known issue: https://github.com/ROCm/ROCm/issues/2728.
//
// For now explicitly defer to the old hipDeviceProp_t struct. This should work
// for both 5.x and 6.x. In the long term we need to switch to use
// hipGetProcAddress once available:
// https://github.com/ROCm/clr/commit/0479cdb3dd30ef58718cad44e424bd793c394cc0
#define HIP_SYMBOL_LIST(FOR_EACH_ERR_FN, FOR_EACH_STR_FN)                      \
  FOR_EACH_STR_FN(hipGetErrorString, hipError_t hipError)                      \
  FOR_EACH_ERR_FN(hipGetDeviceProperties, hipDeviceProp_tR0000 *prop,          \
                  int deviceId)                                                \
  FOR_EACH_ERR_FN(hipModuleLoadDataEx, hipModule_t *module, const void *image, \
                  unsigned int numOptions, hipJitOption *options,              \
                  void **optionValues)                                         \
  FOR_EACH_ERR_FN(hipModuleGetFunction, hipFunction_t *function,               \
                  hipModule_t module, const char *kname)                       \
  FOR_EACH_ERR_FN(hipFuncGetAttribute, int *, hipFunction_attribute attr,      \
                  hipFunction_t function)

// The HIP symbol table for holding resolved dynamic library symbols.
// Each entry is a function pointer with the signature declared in
// HIP_SYMBOL_LIST above; populated once by initSymbolTable().
struct HIPSymbolTable {
#define DEFINE_EACH_ERR_FIELD(hipSymbolName, ...)                              \
  hipError_t (*hipSymbolName)(__VA_ARGS__);
#define DEFINE_EACH_STR_FIELD(hipSymbolName, ...)                              \
  const char *(*hipSymbolName)(__VA_ARGS__);

  HIP_SYMBOL_LIST(DEFINE_EACH_ERR_FIELD, DEFINE_EACH_STR_FIELD)
};

static struct HIPSymbolTable hipSymbolTable;
58
+
59
+ bool initSymbolTable() {
60
+ // Use the HIP runtime library loaded into the existing process if it exits.
61
+ void *lib = dlopen("libamdhip64.so", RTLD_NOLOAD);
62
+ if (lib) {
63
+ // printf("[triton] chosen loaded libamdhip64.so in the process\n");
64
+ }
65
+
66
+ // Otherwise, go through the list of search paths to dlopen the first HIP
67
+ // driver library.
68
+ if (!lib) {
69
+ int n = sizeof(hipLibSearchPaths) / sizeof(hipLibSearchPaths[0]);
70
+ for (int i = 0; i < n; ++i) {
71
+ void *handle = dlopen(hipLibSearchPaths[i], RTLD_LAZY | RTLD_LOCAL);
72
+ if (handle) {
73
+ lib = handle;
74
+ // printf("[triton] chosen %s\n", hipLibSearchPaths[i]);
75
+ }
76
+ }
77
+ }
78
+ if (!lib) {
79
+ PyErr_SetString(PyExc_RuntimeError, "cannot open libamdhip64.so");
80
+ return false;
81
+ }
82
+
83
+ // Resolve all symbols we are interested in.
84
+ dlerror(); // Clear existing errors
85
+ const char *error = NULL;
86
+ #define QUERY_EACH_FN(hipSymbolName, ...) \
87
+ *(void **)&hipSymbolTable.hipSymbolName = dlsym(lib, #hipSymbolName); \
88
+ error = dlerror(); \
89
+ if (error) { \
90
+ PyErr_SetString(PyExc_RuntimeError, \
91
+ "cannot query " #hipSymbolName " from libamdhip64.so"); \
92
+ dlclose(lib); \
93
+ return false; \
94
+ }
95
+
96
+ HIP_SYMBOL_LIST(QUERY_EACH_FN, QUERY_EACH_FN)
97
+
98
+ return true;
99
+ }
100
+
101
+ static inline void gpuAssert(hipError_t code, const char *file, int line) {
102
+ {
103
+ if (code != HIP_SUCCESS) {
104
+ {
105
+ const char *prefix = "Triton Error [HIP]: ";
106
+ const char *str = hipSymbolTable.hipGetErrorString(code);
107
+ char err[1024] = {0};
108
+ snprintf(err, 1024, "%s Code: %d, Messsage: %s", prefix, code, str);
109
+ PyGILState_STATE gil_state;
110
+ gil_state = PyGILState_Ensure();
111
+ PyErr_SetString(PyExc_RuntimeError, err);
112
+ PyGILState_Release(gil_state);
113
+ }
114
+ }
115
+ }
116
+ }
117
+
118
+ #define HIP_CHECK(ans) \
119
+ { \
120
+ gpuAssert((ans), __FILE__, __LINE__); \
121
+ if (PyErr_Occurred()) \
122
+ return NULL; \
123
+ }
124
+
125
// Python: get_device_properties(device_id) -> dict
// Queries the HIP runtime for the properties of one device and returns them
// as a Python dict. Deliberately uses the pre-6.0 hipDeviceProp_tR0000
// struct so the layout matches the old hipGetDeviceProperties symbol on
// both HIP 5.x and 6.x (see the comment above HIP_SYMBOL_LIST).
static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
  int device_id;
  if (!PyArg_ParseTuple(args, "i", &device_id))
    return NULL;

  hipDeviceProp_tR0000 props;
  HIP_CHECK(hipSymbolTable.hipGetDeviceProperties(&props, device_id));

  // create a struct to hold device properties
  return Py_BuildValue(
      "{s:i, s:i, s:i, s:i, s:i, s:i, s:s, s:i}", "max_shared_mem",
      props.sharedMemPerBlock, "max_num_regs", props.regsPerBlock,
      "multiprocessor_count", props.multiProcessorCount, "sm_clock_rate",
      props.clockRate, "mem_clock_rate", props.memoryClockRate, "mem_bus_width",
      props.memoryBusWidth, "arch", props.gcnArchName, "warpSize",
      props.warpSize);
}
142
+
143
// Python: load_binary(name, data, shared, device) -> (module, function, n_regs, n_spills)
// Loads an hsaco code object into the HIP driver and resolves the kernel
// `name` inside it. Handles are returned as raw integers (uint64_t).
// NOTE(review): `shared` and `device` are parsed but not used below —
// presumably kept for signature parity with other backends; confirm.
static PyObject *loadBinary(PyObject *self, PyObject *args) {
  const char *name;
  const char *data;
  Py_ssize_t data_size;
  int shared;
  int device;
  if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared,
                        &device)) {
    return NULL;
  }

  // Set HIP JIT options: capture error/info logs into stack buffers and
  // enable verbose logging. optval[] entries pair up with opt[] entries.
  hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes,
                        hipJitOptionErrorLogBuffer,
                        hipJitOptionInfoLogBufferSizeBytes,
                        hipJitOptionInfoLogBuffer, hipJitOptionLogVerbose};
  const unsigned int errbufsize = 8192;
  const unsigned int logbufsize = 8192;
  char _err[errbufsize];
  char _log[logbufsize];
  void *optval[] = {(void *)(uintptr_t)errbufsize, (void *)_err,
                    (void *)(uintptr_t)logbufsize, (void *)_log, (void *)1};

  // Load the binary and look up the kernel entry point.
  hipModule_t mod;
  hipFunction_t fun;
  HIP_CHECK(hipSymbolTable.hipModuleLoadDataEx(&mod, data, 5, opt, optval))
  HIP_CHECK(hipSymbolTable.hipModuleGetFunction(&fun, mod, name));

  // get allocated registers and spilled registers from the function
  int n_regs = 0;
  int n_spills = 0;
  hipSymbolTable.hipFuncGetAttribute(&n_regs, HIP_FUNC_ATTRIBUTE_NUM_REGS, fun);
  hipSymbolTable.hipFuncGetAttribute(&n_spills,
                                     HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun);
  // Local (scratch) size is reported in bytes; report spills in dwords.
  n_spills /= 4;
  if (PyErr_Occurred()) {
    return NULL;
  }
  return Py_BuildValue("(KKii)", (uint64_t)mod, (uint64_t)fun, n_regs,
                       n_spills);
}
185
+
186
+ static PyMethodDef ModuleMethods[] = {
187
+ {"load_binary", loadBinary, METH_VARARGS,
188
+ "Load provided hsaco into HIP driver"},
189
+ {"get_device_properties", getDeviceProperties, METH_VARARGS,
190
+ "Get the properties for a given device"},
191
+ {NULL, NULL, 0, NULL} // sentinel
192
+ };
193
+
194
+ static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "hip_utils",
195
+ NULL, // documentation
196
+ -1, // size
197
+ ModuleMethods};
198
+
199
+ PyMODINIT_FUNC PyInit_hip_utils(void) {
200
+ if (!initSymbolTable()) {
201
+ return NULL;
202
+ }
203
+
204
+ PyObject *m = PyModule_Create(&ModuleDef);
205
+ if (m == NULL) {
206
+ return NULL;
207
+ }
208
+ PyModule_AddFunctions(m, ModuleMethods);
209
+
210
+ return m;
211
+ }
.venv/lib/python3.11/site-packages/triton/backends/amd/driver.py ADDED
@@ -0,0 +1,497 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import os
3
+ import hashlib
4
+ import subprocess
5
+ import tempfile
6
+ from pathlib import Path
7
+ from triton.runtime.build import _build
8
+ from triton.runtime.cache import get_cache_manager
9
+ from triton.backends.compiler import GPUTarget
10
+ from triton.backends.driver import GPUDriver
11
+
12
# Directory containing this backend; resolved through symlinks so that the
# bundled headers are found even when the package is installed via a link.
dirname = os.path.dirname(os.path.realpath(__file__))
# Include path for building driver.c (ships hip/hip_runtime.h etc.).
include_dir = [os.path.join(dirname, "include")]
14
+
15
+
16
+ def _find_already_mmapped_dylib_on_linux(lib_name):
17
+ import platform
18
+ if platform.system() != 'Linux':
19
+ return None
20
+
21
+ # Use dl_iterate_phdr to walk through the list of shared libraries at runtime.
22
+ # See https://www.man7.org/linux/man-pages/man3/dl_iterate_phdr.3.html for details.
23
+
24
+ import ctypes
25
+ from ctypes import c_char, c_int, c_size_t, c_void_p, c_char_p, POINTER
26
+
27
+ class DlPhdrInfo(ctypes.Structure):
28
+ _fields_ = [
29
+ ('dlpi_addr', c_void_p),
30
+ ('dlpi_name', c_char_p),
31
+ # We don't care about the remaining fields.
32
+ ]
33
+
34
+ # callback_t must use POINTER(c_char) to avoid copying.
35
+ callback_t = ctypes.CFUNCTYPE(c_int, POINTER(DlPhdrInfo), POINTER(c_size_t), POINTER(c_char))
36
+
37
+ # Load libc and get the dl_iterate_phdr symbol.
38
+ try:
39
+ dl_iterate_phdr = ctypes.CDLL('libc.so.6').dl_iterate_phdr
40
+ except:
41
+ return None
42
+ # argtypes must use c_char_p to accept create_string_buffer.
43
+ dl_iterate_phdr.argtypes = [callback_t, c_char_p]
44
+ dl_iterate_phdr.restype = c_int
45
+
46
+ max_path_length = 4096
47
+ path = ctypes.create_string_buffer(max_path_length + 1)
48
+
49
+ # Define callback to get the loaded dylib path.
50
+ def callback(info, size, data):
51
+ dlpi_name = info.contents.dlpi_name
52
+ p = Path(os.fsdecode(dlpi_name))
53
+ if lib_name in p.name:
54
+ # Found the dylib; get its path.
55
+ ctypes.memmove(data, dlpi_name, min(max_path_length, len(dlpi_name)))
56
+ return 1
57
+ return 0
58
+
59
+ if dl_iterate_phdr(callback_t(callback), path):
60
+ return os.fsdecode(ctypes.string_at(path))
61
+ return None
62
+
63
+
64
@functools.lru_cache()
def _get_path_to_hip_runtime_dylib():
    """Locate libamdhip64.so, trying the most authoritative sources first.

    Order: TRITON_LIBHIP_PATH override, a copy already mapped into this
    process, the PyTorch-bundled copy in site-packages, LD_LIBRARY_PATH,
    the ldconfig cache, and finally /opt/rocm/lib. Raises RuntimeError
    listing every attempted path if none exists. Cached per process.
    """
    lib_name = "libamdhip64.so"

    # If we are told explicitly what HIP runtime dynamic library to use, obey that.
    env_libhip_path = os.getenv("TRITON_LIBHIP_PATH")
    if env_libhip_path:
        if env_libhip_path.endswith(lib_name) and os.path.exists(env_libhip_path):
            return env_libhip_path
        raise RuntimeError(f"TRITON_LIBHIP_PATH '{env_libhip_path}' does not point to a valid {lib_name}")

    # If the shared object is already mmapped to address space, use it.
    mmapped_path = _find_already_mmapped_dylib_on_linux(lib_name)
    if mmapped_path:
        if os.path.exists(mmapped_path):
            return mmapped_path
        raise RuntimeError(f"memory mapped '{mmapped_path}' in process does not point to a valid {lib_name}")

    # Accumulates every candidate we rejected, for the final error message.
    paths = []

    import site
    # First search the HIP runtime dynamic library packaged with PyTorch. It's very likely
    # that we run Triton together with PyTorch. This makes sure we use the same dynamic
    # library to avoid version mismatch.
    site_packages = site.getsitepackages()
    user_site = site.getusersitepackages()
    if site.ENABLE_USER_SITE:  # ENABLE_USER_SITE is initialized in getusersitepackages()
        site_packages = [user_site] + site_packages
    for path in site_packages:
        path = os.path.join(path, "torch", "lib", lib_name)
        if os.path.exists(path):
            return path
        paths.append(path)

    # Then try to see if developer provides a HIP runtime dynamic library using LD_LIBRARY_PATH.
    env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
    if env_ld_library_path:
        for d in env_ld_library_path.split(":"):
            f = os.path.join(d, lib_name)
            if os.path.exists(f):
                return f
            paths.append(f)

    # Afterwards try to search the loader dynamic library resolution paths.
    libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
    # each line looks like the following:
    # libamdhip64.so.6 (libc6,x86-64) => /opt/rocm-6.0.2/lib/libamdhip64.so.6
    # libamdhip64.so (libc6,x86-64) => /opt/rocm-6.0.2/lib/libamdhip64.so
    locs = [line.split()[-1] for line in libs.splitlines() if line.strip().endswith(lib_name)]
    for loc in locs:
        if os.path.exists(loc):
            return loc
        paths.append(loc)

    # As a last resort, guess if we have it in some common installation path.
    common_install_path = os.path.join('/opt/rocm/lib/', lib_name)
    if os.path.exists(common_install_path):
        return common_install_path
    paths.append(common_install_path)

    raise RuntimeError(f"cannot locate {lib_name} after attempted paths {paths}")
125
+
126
+
127
def compile_module_from_src(src, name):
    """Compile C source into a shared object, cache it, and import it.

    The cache key is the SHA-256 of the source text, so identical sources
    reuse the previously built ``<name>.so`` instead of rebuilding.
    """
    digest = hashlib.sha256(src.encode("utf-8")).hexdigest()
    cache = get_cache_manager(digest)
    so_path = cache.get_file(f"{name}.so")
    if so_path is None:
        # Cache miss: build in a scratch directory, then persist the binary
        # through the cache manager.
        with tempfile.TemporaryDirectory() as build_dir:
            c_file = os.path.join(build_dir, "main.c")
            with open(c_file, "w") as f:
                f.write(src)
            built_so = _build(name, c_file, build_dir, [], include_dir, [])
            with open(built_so, "rb") as f:
                so_path = cache.put(f.read(), f"{name}.so", binary=True)
    # Import the shared object as a regular Python extension module.
    import importlib.util
    spec = importlib.util.spec_from_file_location(name, so_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
144
+
145
+
146
class HIPUtils(object):
    """Process-wide singleton exposing the compiled hip_utils C extension.

    Provides `load_binary` and `get_device_properties`, backed by driver.c
    compiled against the located HIP runtime library.
    """

    def __new__(cls):
        # Classic singleton: cache the single instance on the class.
        if not hasattr(cls, "instance"):
            cls.instance = super(HIPUtils, cls).__new__(cls)
        return cls.instance

    def __init__(self):
        # NOTE(review): __init__ runs on every HIPUtils() call even though
        # __new__ returns the cached instance; recompilation is presumably
        # avoided by the on-disk cache in compile_module_from_src — confirm.
        libhip_path = _get_path_to_hip_runtime_dylib()
        src = Path(os.path.join(dirname, "driver.c")).read_text()
        # Just do a simple search and replace here instead of templates or format strings.
        # This way we don't need to escape-quote C code curly brackets and we can replace
        # exactly once.
        src = src.replace('/*py_libhip_search_path*/', libhip_path, 1)
        mod = compile_module_from_src(src, "hip_utils")
        self.load_binary = mod.load_binary
        self.get_device_properties = mod.get_device_properties
163
+
164
+
165
+ # -------------------- Launcher ----------------------------
166
def ty_to_cpp(ty):
    """Map a Triton type string to the C type name used in the launcher glue.

    Any pointer type (leading ``*``) becomes ``hipDeviceptr_t``; scalar types
    are resolved through a fixed lookup table.  Unknown scalar types raise
    KeyError.
    """
    if ty[0] == '*':
        return "hipDeviceptr_t"
    scalar_to_cpp = {
        "i1": "int32_t",
        "i8": "int8_t",
        "i16": "int16_t",
        "i32": "int32_t",
        "i64": "int64_t",
        "u1": "uint32_t",
        "u8": "uint8_t",
        "u16": "uint16_t",
        "u32": "uint32_t",
        "u64": "uint64_t",
        # 16-bit float types are widened to float at the launcher ABI boundary.
        "fp16": "float",
        "bf16": "float",
        "fp32": "float",
        "f32": "float",
        "fp64": "double",
    }
    return scalar_to_cpp[ty]
186
+
187
+
188
def make_launcher(constants, signature, ids, warp_size):
    """Generate C source for a CPython extension that launches a HIP kernel.

    The emitted module exposes a single ``launch(gridX, gridY, gridZ, stream,
    function, kernel_metadata, launch_metadata, enter_hook, exit_hook, *args)``
    entry point.  It dlopens the HIP runtime, resolves the needed symbols,
    converts each Python argument per ``signature`` (pointer-typed arguments
    are validated through hipPointerGetAttribute), and calls
    hipModuleLaunchKernel with blockDimX = warp_size * num_warps.

    The returned string is compiled by the caller (see HIPLauncher); nothing
    is executed here.
    """
    # NOTE(review): start_desc appears unused in this function — leftover?
    start_desc = len(signature)
    #signature = generate_cu_signature(constants, signature, ids)
    # C parameter declarations for the _launch helper, one per signature entry.
    arg_decls = ', '.join(f"{ty_to_cpp(ty)} arg{i}" for i, ty in signature.items())

    def _extracted_type(ty):
        # C type used to *receive* the argument from PyArg_ParseTuple
        # (pointer-typed args arrive as PyObject* and are converted later).
        if ty[0] == '*':
            return "PyObject*"
        return {
            'i1': 'int32_t',
            'i8': 'int8_t',
            'i16': 'int16_t',
            'i32': 'int32_t',
            'i64': 'int64_t',
            'u1': 'uint32_t',
            'u8': 'uint8_t',
            'u16': 'uint16_t',
            'u32': 'uint32_t',
            'u64': 'uint64_t',
            'fp16': 'float',
            'bf16': 'float',
            'fp32': 'float',
            'f32': 'float',
            'fp64': 'double',
        }[ty]

    def format_of(ty):
        # PyArg_ParseTuple format character for a given C receiving type.
        return {
            "PyObject*": "O",
            "float": "f",
            "double": "d",
            "long": "l",
            "int8_t": "b",
            "int16_t": "h",
            "int32_t": "i",
            "int64_t": "l",
            "uint8_t": "B",
            "uint16_t": "H",
            "uint32_t": "I",
            "uint64_t": "K",
        }[ty]

    args_format = ''.join([format_of(_extracted_type(ty)) for ty in signature.values()])
    # Fixed prefix: grid dims (iii), stream+function handles (KK), then four
    # PyObject*s (kernel_metadata, launch_metadata, enter/exit hooks).
    format = "iiiKKOOOO" + args_format
    args_list = ', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''

    libhip_path = _get_path_to_hip_runtime_dylib()

    # generate glue code
    # Constant-folded arguments are parsed but not forwarded to the kernel.
    params = [i for i in signature.keys() if i not in constants]
    src = f"""
#define __HIP_PLATFORM_AMD__
#include <hip/hip_runtime.h>
#include <Python.h>
#include <dlfcn.h>
#include <stdbool.h>
#include <dlfcn.h>

// The list of paths to search for the HIP runtime library. The caller Python
// code should substitute the search path placeholder.
static const char *hipLibSearchPaths[] = {{"{libhip_path}"}};

// The list of HIP dynamic library symbols and their signature we are interested
// in this file.
#define HIP_SYMBOL_LIST(FOR_EACH_ERR_FN, FOR_EACH_STR_FN) \\
  FOR_EACH_STR_FN(hipGetErrorString, hipError_t hipError) \\
  FOR_EACH_ERR_FN(hipModuleLaunchKernel, hipFunction_t f, \\
                  unsigned int gridDimX, unsigned int gridDimY, \\
                  unsigned int gridDimZ, unsigned int blockDimX, \\
                  unsigned int blockDimY, unsigned int blockDimZ, \\
                  unsigned int sharedMemBytes, hipStream_t stream, \\
                  void **kernelParams, void **extra) \\
  FOR_EACH_ERR_FN(hipPointerGetAttribute, void *data, \\
                  hipPointer_attribute attribute, hipDeviceptr_t ptr)

// The HIP symbol table for holding resolved dynamic library symbols.
struct HIPSymbolTable {{
#define DEFINE_EACH_ERR_FIELD(hipSymbolName, ...) \\
  hipError_t (*hipSymbolName)(__VA_ARGS__);
#define DEFINE_EACH_STR_FIELD(hipSymbolName, ...) \\
  const char *(*hipSymbolName)(__VA_ARGS__);

  HIP_SYMBOL_LIST(DEFINE_EACH_ERR_FIELD, DEFINE_EACH_STR_FIELD)
}};

static struct HIPSymbolTable hipSymbolTable;

bool initSymbolTable() {{
  // Use the HIP runtime library loaded into the existing process if it exits.
  void *lib = dlopen("libamdhip64.so", RTLD_NOLOAD);
  if (lib) {{
    // printf("[triton] chosen loaded libamdhip64.so in the process\\n");
  }}

  // Otherwise, go through the list of search paths to dlopen the first HIP
  // driver library.
  if (!lib) {{
    int n = sizeof(hipLibSearchPaths) / sizeof(hipLibSearchPaths[0]);
    for (int i = 0; i < n; ++i) {{
      void *handle = dlopen(hipLibSearchPaths[i], RTLD_LAZY | RTLD_LOCAL);
      if (handle) {{
        lib = handle;
        // printf("[triton] chosen %s\\n", hipLibSearchPaths[i]);
      }}
    }}
  }}
  if (!lib) {{
    PyErr_SetString(PyExc_RuntimeError, "cannot open libamdhip64.so");
    return false;
  }}

  // Resolve all symbols we are interested in.
  dlerror(); // Clear existing errors
  const char *error = NULL;
#define QUERY_EACH_FN(hipSymbolName, ...) \\
  *(void **)&hipSymbolTable.hipSymbolName = dlsym(lib, #hipSymbolName); \\
  error = dlerror(); \\
  if (error) {{ \\
    PyErr_SetString(PyExc_RuntimeError, \\
                    "cannot query " #hipSymbolName " from libamdhip64.so"); \\
    dlclose(lib); \\
    return false; \\
  }}

  HIP_SYMBOL_LIST(QUERY_EACH_FN, QUERY_EACH_FN)

  return true;
}}

static inline void gpuAssert(hipError_t code, const char *file, int line)
{{
  if (code != HIP_SUCCESS)
  {{
     const char* prefix = "Triton Error [HIP]: ";
     const char* str = hipSymbolTable.hipGetErrorString(code);
     char err[1024] = {{0}};
     snprintf(err, 1024, "%s Code: %d, Messsage: %s", prefix, code, str );
     PyErr_SetString(PyExc_RuntimeError, err);
  }}
}}

#define HIP_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }}

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, hipStream_t stream, hipFunction_t function{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
  // printf("_launch hip kernel\\n");
  void *params[] = {{ {', '.join(f"&arg{i}" for i in params)} }};
  if (gridX*gridY*gridZ > 0) {{
      HIP_CHECK(hipSymbolTable.hipModuleLaunchKernel(function, gridX, gridY, gridZ, {warp_size}*num_warps, 1, 1, shared_memory, stream, params, 0));
  }}
}}

typedef struct _DevicePtrInfo {{
    hipDeviceptr_t dev_ptr;
    bool valid;
}} DevicePtrInfo;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {{
    ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }}
  if (obj == Py_None) {{
    // valid nullptr
    return ptr_info;
  }}
  PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
  if(ptr){{
    PyObject *empty_tuple = PyTuple_New(0);
    PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL);
    Py_DECREF(empty_tuple);
    Py_DECREF(ptr);
    if (!PyLong_Check(ret)) {{
      PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
      ptr_info.valid = false;
      return ptr_info;
    }}
    ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(ret);
    if(!ptr_info.dev_ptr)
      return ptr_info;
    uint64_t dev_ptr;
    hipError_t status = hipSymbolTable.hipPointerGetAttribute(&dev_ptr, HIP_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
    if (status == hipErrorInvalidValue) {{
      PyErr_Format(PyExc_ValueError,
                   "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
      ptr_info.valid = false;
    }}
    ptr_info.dev_ptr = (hipDeviceptr_t)dev_ptr;
    Py_DECREF(ret);
    return ptr_info;
  }}
  PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
  return ptr_info;
}}

static PyObject* launch(PyObject* self, PyObject* args) {{
  // printf("launch\\n");
  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
  {' '.join([f"{_extracted_type(ty)} _arg{i}; " for i, ty in signature.items()])}
  if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ, &_stream, &_function,
                                       &kernel_metadata, &launch_metadata,
                                       &launch_enter_hook, &launch_exit_hook {args_list})) {{
    return NULL;
  }}

  // extract kernel metadata
  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, \"iiiiii\", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {{
    return NULL;
  }}
  // extract launch metadata
  if (launch_enter_hook != Py_None){{
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_enter_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
  }}


  // raise exception asap
  {"; ".join([f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;" if ty[0] == "*" else "" for i, ty in signature.items()])};
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (hipStream_t)_stream, (hipFunction_t)_function{', ' + ', '.join(f"ptr_info{i}.dev_ptr" if ty[0]=="*" else f"_arg{i}"for i, ty in signature.items()) if len(signature) > 0 else ''});

  if(launch_exit_hook != Py_None){{
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_exit_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
  }}

  if(PyErr_Occurred()) {{
    return NULL;
  }}
  // return None
  Py_INCREF(Py_None);
  return Py_None;
}}

static PyMethodDef ModuleMethods[] = {{
  {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}},
  {{NULL, NULL, 0, NULL}} // sentinel
}};

static struct PyModuleDef ModuleDef = {{
  PyModuleDef_HEAD_INIT,
  \"__triton_launcher\",
  NULL, //documentation
  -1, //size
  ModuleMethods
}};

PyMODINIT_FUNC PyInit___triton_launcher(void) {{
  if (!initSymbolTable()) {{
    return NULL;
  }}
  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {{
    return NULL;
  }}
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}}
"""
    return src
462
+
463
+
464
class HIPLauncher(object):
    """Builds and holds the compiled C launcher for one kernel signature."""

    def __init__(self, src, metadata):
        def to_arg_index(key):
            # Constant/signature keys may be argument *names*; normalize them
            # to positional indices so both dicts are keyed uniformly.
            if isinstance(key, str):
                return src.fn.arg_names.index(key)
            return key

        ids = {"ids_of_const_exprs": src.fn.constexprs if hasattr(src, "fn") else tuple()}
        raw_constants = src.constants if hasattr(src, "constants") else dict()
        constants = {to_arg_index(key): value for key, value in raw_constants.items()}
        signature = {to_arg_index(key): value for key, value in src.signature.items()}
        launcher_src = make_launcher(constants, signature, ids, metadata.warp_size)
        mod = compile_module_from_src(launcher_src, "__triton_launcher")
        self.launch = mod.launch

    def __call__(self, *args, **kwargs):
        self.launch(*args, **kwargs)
478
+
479
+
480
class HIPDriver(GPUDriver):
    """Triton GPU driver backed by the HIP runtime (AMD GPUs)."""

    def __init__(self):
        super().__init__()
        self.utils = HIPUtils()
        self.launcher_cls = HIPLauncher

    @staticmethod
    def is_active():
        # Active only when the installed torch build targets ROCm/HIP.
        import torch
        return torch.version.hip is not None

    def get_current_target(self):
        device = self.get_current_device()
        props = self.utils.get_device_properties(device)
        # The arch string may carry ':'-separated feature suffixes; keep only
        # the leading architecture name.
        gfx_arch = props['arch'].split(':')[0]
        return GPUTarget("hip", gfx_arch, props['warpSize'])
.venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/hip_runtime.h ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining a copy
5
+ of this software and associated documentation files (the "Software"), to deal
6
+ in the Software without restriction, including without limitation the rights
7
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the Software is
9
+ furnished to do so, subject to the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be included in
12
+ all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
+ THE SOFTWARE.
21
+ */
22
+
23
+ //! HIP = Heterogeneous-compute Interface for Portability
24
+ //!
25
+ //! Define a extremely thin runtime layer that allows source code to be compiled unmodified
26
+ //! through either AMD CLANG or NVCC. Key features tend to be in the spirit
27
+ //! and terminology of CUDA, but with a portable path to other accelerators as well:
28
+ //
29
+ //! Both paths support rich C++ features including classes, templates, lambdas, etc.
30
+ //! Runtime API is C
31
+ //! Memory management is based on pure pointers and resembles malloc/free/copy.
32
+ //
33
+ //! hip_runtime.h : includes everything in hip_api.h, plus math builtins and kernel launch
34
+ //! macros. hip_runtime_api.h : Defines HIP API. This is a C header file and does not use any C++
35
+ //! features.
36
+
37
+ #ifndef HIP_INCLUDE_HIP_HIP_RUNTIME_H
38
+ #define HIP_INCLUDE_HIP_HIP_RUNTIME_H
39
+
40
+ #if __HIP_DEVICE_COMPILE__ && !__GFX7__ && !__GFX8__ && !__GFX9__ && __AMDGCN_WAVEFRONT_SIZE == 64
41
+ #error HIP is not supported on the specified GPU ARCH with wavefront size 64
42
+ #endif
43
+
44
+ #if !defined(__HIPCC_RTC__)
45
+ // Some standard header files, these are included by hc.hpp and so want to make them avail on both
46
+ // paths to provide a consistent include env and avoid "missing symbol" errors that only appears
47
+ // on NVCC path:
48
+ #include <stdint.h>
49
+ #include <stdio.h>
50
+ #include <stdlib.h>
51
+ #include <assert.h>
52
+
53
+ #if __cplusplus > 199711L
54
+ #include <thread>
55
+ #endif
56
+ #endif // !defined(__HIPCC_RTC__)
57
+
58
+ #include <hip/hip_version.h>
59
+ #include <hip/hip_common.h>
60
+
61
+ #if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
62
+ #include <hip/amd_detail/amd_hip_runtime.h>
63
+ #elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
64
+ #include <hip/nvidia_detail/nvidia_hip_runtime.h>
65
+ #else
66
+ #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
67
+ #endif
68
+
69
+ #if !defined(__HIPCC_RTC__)
70
+ #include <hip/hip_runtime_api.h>
71
+ #include <hip/library_types.h>
72
+ #endif // !defined(__HIPCC_RTC__)
73
+ #include <hip/hip_vector_types.h>
74
+
75
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/amd/include/hip/texture_types.h ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining a copy
5
+ of this software and associated documentation files (the "Software"), to deal
6
+ in the Software without restriction, including without limitation the rights
7
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the Software is
9
+ furnished to do so, subject to the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be included in
12
+ all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
+ THE SOFTWARE.
21
+ */
22
+
23
+ #ifndef HIP_INCLUDE_HIP_TEXTURE_TYPES_H
24
+ #define HIP_INCLUDE_HIP_TEXTURE_TYPES_H
25
+
26
+ #if defined(__clang__)
27
+ #pragma clang diagnostic push
28
+ #pragma clang diagnostic ignored "-Wreserved-identifier"
29
+ #pragma clang diagnostic ignored "-Wreserved-macro-identifier"
30
+ #pragma clang diagnostic ignored "-Wc++98-compat"
31
+ #endif
32
+
33
+ #if !defined(__HIPCC_RTC__)
34
+ #include <hip/hip_common.h>
35
+ #endif
36
+
37
+ #if !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
38
+ #include "texture_types.h"
39
+ #elif defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
40
+ /*******************************************************************************
41
+ * *
42
+ * *
43
+ * *
44
+ *******************************************************************************/
45
+ #if !defined(__HIPCC_RTC__)
46
+ #include <limits.h>
47
+ #include <hip/channel_descriptor.h>
48
+ #include <hip/driver_types.h>
49
+ #endif // !defined(__HIPCC_RTC__)
50
+
51
+ #define hipTextureType1D 0x01
52
+ #define hipTextureType2D 0x02
53
+ #define hipTextureType3D 0x03
54
+ #define hipTextureTypeCubemap 0x0C
55
+ #define hipTextureType1DLayered 0xF1
56
+ #define hipTextureType2DLayered 0xF2
57
+ #define hipTextureTypeCubemapLayered 0xFC
58
+
59
+ /**
60
+ * Should be same as HSA_IMAGE_OBJECT_SIZE_DWORD/HSA_SAMPLER_OBJECT_SIZE_DWORD
61
+ */
62
+ #define HIP_IMAGE_OBJECT_SIZE_DWORD 12
63
+ #define HIP_SAMPLER_OBJECT_SIZE_DWORD 8
64
+ #define HIP_SAMPLER_OBJECT_OFFSET_DWORD HIP_IMAGE_OBJECT_SIZE_DWORD
65
+ #define HIP_TEXTURE_OBJECT_SIZE_DWORD (HIP_IMAGE_OBJECT_SIZE_DWORD + HIP_SAMPLER_OBJECT_SIZE_DWORD)
66
+
67
+ /**
68
+ * An opaque value that represents a hip texture object
69
+ */
70
+ struct __hip_texture;
71
+ typedef struct __hip_texture* hipTextureObject_t;
72
+
73
+ /**
74
+ * hip texture address modes
75
+ */
76
+ enum hipTextureAddressMode {
77
+ hipAddressModeWrap = 0,
78
+ hipAddressModeClamp = 1,
79
+ hipAddressModeMirror = 2,
80
+ hipAddressModeBorder = 3
81
+ };
82
+
83
+ /**
84
+ * hip texture filter modes
85
+ */
86
+ enum hipTextureFilterMode { hipFilterModePoint = 0, hipFilterModeLinear = 1 };
87
+
88
+ /**
89
+ * hip texture read modes
90
+ */
91
+ enum hipTextureReadMode { hipReadModeElementType = 0, hipReadModeNormalizedFloat = 1 };
92
+
93
/**
 * hip texture reference
 *
 * Host-visible sampling state; the templated `texture` wrapper defined later
 * in this header derives from this struct and fills these fields.
 */
typedef struct textureReference {
    int normalized;                             // NOTE(review): presumably non-zero => normalized coordinates — confirm against HIP docs
    enum hipTextureReadMode readMode;           // used only for driver API's
    enum hipTextureFilterMode filterMode;
    enum hipTextureAddressMode addressMode[3];  // Texture address mode for up to 3 dimensions
    struct hipChannelFormatDesc channelDesc;
    int sRGB;                                   // Perform sRGB->linear conversion during texture read
    unsigned int maxAnisotropy;                 // Limit to the anisotropy ratio
    enum hipTextureFilterMode mipmapFilterMode;
    float mipmapLevelBias;
    float minMipmapLevelClamp;
    float maxMipmapLevelClamp;

    hipTextureObject_t textureObject;
    int numChannels;
    enum hipArray_Format format;
} textureReference;
113
+
114
/**
 * hip texture descriptor
 *
 * Runtime-API counterpart of textureReference: describes how a texture
 * object samples its underlying resource.
 */
typedef struct hipTextureDesc {
    enum hipTextureAddressMode addressMode[3];  // Texture address mode for up to 3 dimensions
    enum hipTextureFilterMode filterMode;
    enum hipTextureReadMode readMode;
    int sRGB;                                   // Perform sRGB->linear conversion during texture read
    float borderColor[4];
    int normalizedCoords;
    unsigned int maxAnisotropy;
    enum hipTextureFilterMode mipmapFilterMode;
    float mipmapLevelBias;
    float minMipmapLevelClamp;
    float maxMipmapLevelClamp;
} hipTextureDesc;
130
+
131
+ #if __cplusplus
132
+
133
+ /*******************************************************************************
134
+ * *
135
+ * *
136
+ * *
137
+ *******************************************************************************/
138
+ #if __HIP__
139
+ #define __HIP_TEXTURE_ATTRIB __attribute__((device_builtin_texture_type))
140
+ #else
141
+ #define __HIP_TEXTURE_ATTRIB
142
+ #endif
143
+
144
+ typedef textureReference* hipTexRef;
145
+
146
+ template <class T, int texType = hipTextureType1D,
147
+ enum hipTextureReadMode mode = hipReadModeElementType>
148
+ struct __HIP_TEXTURE_ATTRIB texture : public textureReference {
149
+ texture(int norm = 0, enum hipTextureFilterMode fMode = hipFilterModePoint,
150
+ enum hipTextureAddressMode aMode = hipAddressModeClamp) {
151
+ normalized = norm;
152
+ readMode = mode;
153
+ filterMode = fMode;
154
+ addressMode[0] = aMode;
155
+ addressMode[1] = aMode;
156
+ addressMode[2] = aMode;
157
+ channelDesc = hipCreateChannelDesc<T>();
158
+ sRGB = 0;
159
+ textureObject = nullptr;
160
+ maxAnisotropy = 0;
161
+ mipmapLevelBias = 0;
162
+ minMipmapLevelClamp = 0;
163
+ maxMipmapLevelClamp = 0;
164
+ }
165
+
166
+ texture(int norm, enum hipTextureFilterMode fMode, enum hipTextureAddressMode aMode,
167
+ struct hipChannelFormatDesc desc) {
168
+ normalized = norm;
169
+ readMode = mode;
170
+ filterMode = fMode;
171
+ addressMode[0] = aMode;
172
+ addressMode[1] = aMode;
173
+ addressMode[2] = aMode;
174
+ channelDesc = desc;
175
+ sRGB = 0;
176
+ textureObject = nullptr;
177
+ maxAnisotropy = 0;
178
+ mipmapLevelBias = 0;
179
+ minMipmapLevelClamp = 0;
180
+ maxMipmapLevelClamp = 0;
181
+ }
182
+ };
183
+
184
+ #endif /* __cplusplus */
185
+
186
+ #else
187
+ #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
188
+ #endif
189
+
190
+ #if defined(__clang__)
191
+ #pragma clang diagnostic pop
192
+ #endif
193
+
194
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (195 Bytes). View file
 
.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/compiler.cpython-311.pyc ADDED
Binary file (25.5 kB). View file
 
.venv/lib/python3.11/site-packages/triton/backends/nvidia/__pycache__/driver.cpython-311.pyc ADDED
Binary file (22.6 kB). View file
 
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openacc/cupti_openacc.h ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #include <cuda_stdint.h>
51
+
52
+ #if !defined(_CUPTI_OPENACC_H_)
53
+ #define _CUPTI_OPENACC_H_
54
+
55
+ #ifndef CUPTIAPI
56
+ #ifdef _WIN32
57
+ #define CUPTIAPI __stdcall
58
+ #else
59
+ #define CUPTIAPI
60
+ #endif
61
+ #endif
62
+
63
+ #if defined(__LP64__)
64
+ #define CUPTILP64 1
65
+ #elif defined(_WIN64)
66
+ #define CUPTILP64 1
67
+ #else
68
+ #undef CUPTILP64
69
+ #endif
70
+
71
+ #if defined(__cplusplus)
72
+ extern "C" {
73
+ #endif
74
+
75
+ #if defined(__GNUC__) && defined(CUPTI_LIB)
76
+ #pragma GCC visibility push(default)
77
+ #endif
78
+
79
+ /**
80
+ * \brief Initialize OpenACC support
81
+ *
82
+ * \param profRegister function of type acc_prof_reg as obtained from acc_register_library
83
+ * \param profUnregister function of type acc_prof_reg as obtained from acc_register_library
84
+ * \param profLookup function of type acc_prof_lookup as obtained from acc_register_library
85
+ */
86
+ CUptiResult CUPTIAPI
87
+ cuptiOpenACCInitialize(void *profRegister, void *profUnregister, void *profLookup);
88
+
89
+ #if defined(__GNUC__) && defined(CUPTI_LIB)
90
+ #pragma GCC visibility pop
91
+ #endif
92
+
93
+ #if defined(__cplusplus)
94
+ }
95
+ #endif
96
+
97
+ #endif /*_CUPTI_OPENACC_H_*/
98
+
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/cupti_openmp.h ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #include <cuda_stdint.h>
51
+ #include "Openmp/omp-tools.h"
52
+
53
+ #if !defined(_CUPTI_OPENMP_H_)
54
+ #define _CUPTI_OPENMP_H_
55
+
56
+ #ifndef CUPTIAPI
57
+ #ifdef _WIN32
58
+ #define CUPTIAPI __stdcall
59
+ #else
60
+ #define CUPTIAPI
61
+ #endif
62
+ #endif
63
+
64
+ #if defined(__LP64__)
65
+ #define CUPTILP64 1
66
+ #elif defined(_WIN64)
67
+ #define CUPTILP64 1
68
+ #else
69
+ #undef CUPTILP64
70
+ #endif
71
+
72
+ #if defined(__cplusplus)
73
+ extern "C" {
74
+ #endif
75
+
76
+ #if defined(__GNUC__) && defined(CUPTI_LIB)
77
+ #pragma GCC visibility push(default)
78
+ #endif
79
+
80
+ /**
81
+ * \brief Initialize OPENMP support (deprecated, used before OpenMP 5.0)
82
+ *
83
+ */
84
+ int CUPTIAPI cuptiOpenMpInitialize(ompt_function_lookup_t ompt_fn_lookup, const char *runtime_version, unsigned int ompt_version);
85
+
86
+ /**
87
+ * \brief Initialize OPENMP support
88
+ *
89
+ */
90
+ int CUPTIAPI cuptiOpenMpInitialize_v2(ompt_function_lookup_t lookup, int initial_device_num, ompt_data_t *tool_data);
91
+
92
+ #if defined(__GNUC__) && defined(CUPTI_LIB)
93
+ #pragma GCC visibility pop
94
+ #endif
95
+
96
+ #if defined(__cplusplus)
97
+ }
98
+ #endif
99
+
100
+ #endif /*_CUPTI_OPENMP_H_*/
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/Openmp/omp-tools.h ADDED
@@ -0,0 +1,1083 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * include/50/omp-tools.h.var
3
+ */
4
+
5
+ //===----------------------------------------------------------------------===//
6
+ //
7
+ // The LLVM Compiler Infrastructure
8
+ //
9
+ // This file is dual licensed under the MIT and the University of Illinois Open
10
+ // Source Licenses. See LICENSE.txt for details.
11
+ //
12
+ //===----------------------------------------------------------------------===//
13
+
14
+ #ifndef __OMPT__
15
+ #define __OMPT__
16
+
17
+ /*****************************************************************************
18
+ * system include files
19
+ *****************************************************************************/
20
+
21
+ #include <stdint.h>
22
+ #include <stddef.h>
23
+
24
+ /*****************************************************************************
25
+ * iteration macros
26
+ *****************************************************************************/
27
+
28
+ #define FOREACH_OMPT_INQUIRY_FN(macro) \
29
+ macro (ompt_enumerate_states) \
30
+ macro (ompt_enumerate_mutex_impls) \
31
+ \
32
+ macro (ompt_set_callback) \
33
+ macro (ompt_get_callback) \
34
+ \
35
+ macro (ompt_get_state) \
36
+ \
37
+ macro (ompt_get_parallel_info) \
38
+ macro (ompt_get_task_info) \
39
+ macro (ompt_get_task_memory) \
40
+ macro (ompt_get_thread_data) \
41
+ macro (ompt_get_unique_id) \
42
+ macro (ompt_finalize_tool) \
43
+ \
44
+ macro(ompt_get_num_procs) \
45
+ macro(ompt_get_num_places) \
46
+ macro(ompt_get_place_proc_ids) \
47
+ macro(ompt_get_place_num) \
48
+ macro(ompt_get_partition_place_nums) \
49
+ macro(ompt_get_proc_id) \
50
+ \
51
+ macro(ompt_get_target_info) \
52
+ macro(ompt_get_num_devices)
53
+
54
+ #define FOREACH_OMPT_STATE(macro) \
55
+ \
56
+ /* first available state */ \
57
+ macro (ompt_state_undefined, 0x102) /* undefined thread state */ \
58
+ \
59
+ /* work states (0..15) */ \
60
+ macro (ompt_state_work_serial, 0x000) /* working outside parallel */ \
61
+ macro (ompt_state_work_parallel, 0x001) /* working within parallel */ \
62
+ macro (ompt_state_work_reduction, 0x002) /* performing a reduction */ \
63
+ \
64
+ /* barrier wait states (16..31) */ \
65
+ macro (ompt_state_wait_barrier, 0x010) /* waiting at a barrier */ \
66
+ macro (ompt_state_wait_barrier_implicit_parallel, 0x011) \
67
+ /* implicit barrier at the end of parallel region */\
68
+ macro (ompt_state_wait_barrier_implicit_workshare, 0x012) \
69
+ /* implicit barrier at the end of worksharing */ \
70
+ macro (ompt_state_wait_barrier_implicit, 0x013) /* implicit barrier */ \
71
+ macro (ompt_state_wait_barrier_explicit, 0x014) /* explicit barrier */ \
72
+ \
73
+ /* task wait states (32..63) */ \
74
+ macro (ompt_state_wait_taskwait, 0x020) /* waiting at a taskwait */ \
75
+ macro (ompt_state_wait_taskgroup, 0x021) /* waiting at a taskgroup */ \
76
+ \
77
+ /* mutex wait states (64..127) */ \
78
+ macro (ompt_state_wait_mutex, 0x040) \
79
+ macro (ompt_state_wait_lock, 0x041) /* waiting for lock */ \
80
+ macro (ompt_state_wait_critical, 0x042) /* waiting for critical */ \
81
+ macro (ompt_state_wait_atomic, 0x043) /* waiting for atomic */ \
82
+ macro (ompt_state_wait_ordered, 0x044) /* waiting for ordered */ \
83
+ \
84
+ /* target wait states (128..255) */ \
85
+ macro (ompt_state_wait_target, 0x080) /* waiting for target region */ \
86
+ macro (ompt_state_wait_target_map, 0x081) /* waiting for target data mapping operation */ \
87
+ macro (ompt_state_wait_target_update, 0x082) /* waiting for target update operation */ \
88
+ \
89
+ /* misc (256..511) */ \
90
+ macro (ompt_state_idle, 0x100) /* waiting for work */ \
91
+ macro (ompt_state_overhead, 0x101) /* overhead excluding wait states */ \
92
+ \
93
+ /* implementation-specific states (512..) */
94
+
95
+
96
+ #define FOREACH_KMP_MUTEX_IMPL(macro) \
97
+ macro (kmp_mutex_impl_none, 0) /* unknown implementation */ \
98
+ macro (kmp_mutex_impl_spin, 1) /* based on spin */ \
99
+ macro (kmp_mutex_impl_queuing, 2) /* based on some fair policy */ \
100
+ macro (kmp_mutex_impl_speculative, 3) /* based on HW-supported speculation */
101
+
102
+ #define FOREACH_OMPT_EVENT(macro) \
103
+ \
104
+ /*--- Mandatory Events ---*/ \
105
+ macro (ompt_callback_thread_begin, ompt_callback_thread_begin_t, 1) /* thread begin */ \
106
+ macro (ompt_callback_thread_end, ompt_callback_thread_end_t, 2) /* thread end */ \
107
+ \
108
+ macro (ompt_callback_parallel_begin, ompt_callback_parallel_begin_t, 3) /* parallel begin */ \
109
+ macro (ompt_callback_parallel_end, ompt_callback_parallel_end_t, 4) /* parallel end */ \
110
+ \
111
+ macro (ompt_callback_task_create, ompt_callback_task_create_t, 5) /* task begin */ \
112
+ macro (ompt_callback_task_schedule, ompt_callback_task_schedule_t, 6) /* task schedule */ \
113
+ macro (ompt_callback_implicit_task, ompt_callback_implicit_task_t, 7) /* implicit task */ \
114
+ \
115
+ macro (ompt_callback_target, ompt_callback_target_t, 8) /* target */ \
116
+ macro (ompt_callback_target_data_op, ompt_callback_target_data_op_t, 9) /* target data op */ \
117
+ macro (ompt_callback_target_submit, ompt_callback_target_submit_t, 10) /* target submit */ \
118
+ \
119
+ macro (ompt_callback_control_tool, ompt_callback_control_tool_t, 11) /* control tool */ \
120
+ \
121
+ macro (ompt_callback_device_initialize, ompt_callback_device_initialize_t, 12) /* device initialize */ \
122
+ macro (ompt_callback_device_finalize, ompt_callback_device_finalize_t, 13) /* device finalize */ \
123
+ \
124
+ macro (ompt_callback_device_load, ompt_callback_device_load_t, 14) /* device load */ \
125
+ macro (ompt_callback_device_unload, ompt_callback_device_unload_t, 15) /* device unload */ \
126
+ \
127
+ /* Optional Events */ \
128
+ macro (ompt_callback_sync_region_wait, ompt_callback_sync_region_t, 16) /* sync region wait begin or end */ \
129
+ \
130
+ macro (ompt_callback_mutex_released, ompt_callback_mutex_t, 17) /* mutex released */ \
131
+ \
132
+ macro (ompt_callback_dependences, ompt_callback_dependences_t, 18) /* report task dependences */ \
133
+ macro (ompt_callback_task_dependence, ompt_callback_task_dependence_t, 19) /* report task dependence */ \
134
+ \
135
+ macro (ompt_callback_work, ompt_callback_work_t, 20) /* task at work begin or end */ \
136
+ \
137
+ macro (ompt_callback_master, ompt_callback_master_t, 21) /* task at master begin or end */ \
138
+ \
139
+ macro (ompt_callback_target_map, ompt_callback_target_map_t, 22) /* target map */ \
140
+ \
141
+ macro (ompt_callback_sync_region, ompt_callback_sync_region_t, 23) /* sync region begin or end */ \
142
+ \
143
+ macro (ompt_callback_lock_init, ompt_callback_mutex_acquire_t, 24) /* lock init */ \
144
+ macro (ompt_callback_lock_destroy, ompt_callback_mutex_t, 25) /* lock destroy */ \
145
+ \
146
+ macro (ompt_callback_mutex_acquire, ompt_callback_mutex_acquire_t, 26) /* mutex acquire */ \
147
+ macro (ompt_callback_mutex_acquired, ompt_callback_mutex_t, 27) /* mutex acquired */ \
148
+ \
149
+ macro (ompt_callback_nest_lock, ompt_callback_nest_lock_t, 28) /* nest lock */ \
150
+ \
151
+ macro (ompt_callback_flush, ompt_callback_flush_t, 29) /* after executing flush */ \
152
+ \
153
+ macro (ompt_callback_cancel, ompt_callback_cancel_t, 30) /* cancel innermost binding region */ \
154
+ \
155
+ macro (ompt_callback_reduction, ompt_callback_sync_region_t, 31) /* reduction */ \
156
+ \
157
+ macro (ompt_callback_dispatch, ompt_callback_dispatch_t, 32) /* dispatch of work */
158
+
159
+ /*****************************************************************************
160
+ * implementation specific types
161
+ *****************************************************************************/
162
+
163
+ typedef enum kmp_mutex_impl_t {
164
+ #define kmp_mutex_impl_macro(impl, code) impl = code,
165
+ FOREACH_KMP_MUTEX_IMPL(kmp_mutex_impl_macro)
166
+ #undef kmp_mutex_impl_macro
167
+ } kmp_mutex_impl_t;
168
+
169
+ /*****************************************************************************
170
+ * definitions generated from spec
171
+ *****************************************************************************/
172
+
173
+ typedef enum ompt_callbacks_t {
174
+ ompt_callback_thread_begin = 1,
175
+ ompt_callback_thread_end = 2,
176
+ ompt_callback_parallel_begin = 3,
177
+ ompt_callback_parallel_end = 4,
178
+ ompt_callback_task_create = 5,
179
+ ompt_callback_task_schedule = 6,
180
+ ompt_callback_implicit_task = 7,
181
+ ompt_callback_target = 8,
182
+ ompt_callback_target_data_op = 9,
183
+ ompt_callback_target_submit = 10,
184
+ ompt_callback_control_tool = 11,
185
+ ompt_callback_device_initialize = 12,
186
+ ompt_callback_device_finalize = 13,
187
+ ompt_callback_device_load = 14,
188
+ ompt_callback_device_unload = 15,
189
+ ompt_callback_sync_region_wait = 16,
190
+ ompt_callback_mutex_released = 17,
191
+ ompt_callback_dependences = 18,
192
+ ompt_callback_task_dependence = 19,
193
+ ompt_callback_work = 20,
194
+ ompt_callback_master = 21,
195
+ ompt_callback_target_map = 22,
196
+ ompt_callback_sync_region = 23,
197
+ ompt_callback_lock_init = 24,
198
+ ompt_callback_lock_destroy = 25,
199
+ ompt_callback_mutex_acquire = 26,
200
+ ompt_callback_mutex_acquired = 27,
201
+ ompt_callback_nest_lock = 28,
202
+ ompt_callback_flush = 29,
203
+ ompt_callback_cancel = 30,
204
+ ompt_callback_reduction = 31,
205
+ ompt_callback_dispatch = 32
206
+ } ompt_callbacks_t;
207
+
208
+ typedef enum ompt_record_t {
209
+ ompt_record_ompt = 1,
210
+ ompt_record_native = 2,
211
+ ompt_record_invalid = 3
212
+ } ompt_record_t;
213
+
214
+ typedef enum ompt_record_native_t {
215
+ ompt_record_native_info = 1,
216
+ ompt_record_native_event = 2
217
+ } ompt_record_native_t;
218
+
219
+ typedef enum ompt_set_result_t {
220
+ ompt_set_error = 0,
221
+ ompt_set_never = 1,
222
+ ompt_set_impossible = 2,
223
+ ompt_set_sometimes = 3,
224
+ ompt_set_sometimes_paired = 4,
225
+ ompt_set_always = 5
226
+ } ompt_set_result_t;
227
+
228
+ typedef uint64_t ompt_id_t;
229
+
230
+ typedef uint64_t ompt_device_time_t;
231
+
232
+ typedef uint64_t ompt_buffer_cursor_t;
233
+
234
+ typedef enum ompt_thread_t {
235
+ ompt_thread_initial = 1,
236
+ ompt_thread_worker = 2,
237
+ ompt_thread_other = 3,
238
+ ompt_thread_unknown = 4
239
+ } ompt_thread_t;
240
+
241
+ typedef enum ompt_scope_endpoint_t {
242
+ ompt_scope_begin = 1,
243
+ ompt_scope_end = 2
244
+ } ompt_scope_endpoint_t;
245
+
246
+ typedef enum ompt_dispatch_t {
247
+ ompt_dispatch_iteration = 1,
248
+ ompt_dispatch_section = 2
249
+ } ompt_dispatch_t;
250
+
251
+ typedef enum ompt_sync_region_t {
252
+ ompt_sync_region_barrier = 1,
253
+ ompt_sync_region_barrier_implicit = 2,
254
+ ompt_sync_region_barrier_explicit = 3,
255
+ ompt_sync_region_barrier_implementation = 4,
256
+ ompt_sync_region_taskwait = 5,
257
+ ompt_sync_region_taskgroup = 6,
258
+ ompt_sync_region_reduction = 7
259
+ } ompt_sync_region_t;
260
+
261
+ typedef enum ompt_target_data_op_t {
262
+ ompt_target_data_alloc = 1,
263
+ ompt_target_data_transfer_to_device = 2,
264
+ ompt_target_data_transfer_from_device = 3,
265
+ ompt_target_data_delete = 4,
266
+ ompt_target_data_associate = 5,
267
+ ompt_target_data_disassociate = 6
268
+ } ompt_target_data_op_t;
269
+
270
+ typedef enum ompt_work_t {
271
+ ompt_work_loop = 1,
272
+ ompt_work_sections = 2,
273
+ ompt_work_single_executor = 3,
274
+ ompt_work_single_other = 4,
275
+ ompt_work_workshare = 5,
276
+ ompt_work_distribute = 6,
277
+ ompt_work_taskloop = 7
278
+ } ompt_work_t;
279
+
280
+ typedef enum ompt_mutex_t {
281
+ ompt_mutex_lock = 1,
282
+ ompt_mutex_test_lock = 2,
283
+ ompt_mutex_nest_lock = 3,
284
+ ompt_mutex_test_nest_lock = 4,
285
+ ompt_mutex_critical = 5,
286
+ ompt_mutex_atomic = 6,
287
+ ompt_mutex_ordered = 7
288
+ } ompt_mutex_t;
289
+
290
+ typedef enum ompt_native_mon_flag_t {
291
+ ompt_native_data_motion_explicit = 0x01,
292
+ ompt_native_data_motion_implicit = 0x02,
293
+ ompt_native_kernel_invocation = 0x04,
294
+ ompt_native_kernel_execution = 0x08,
295
+ ompt_native_driver = 0x10,
296
+ ompt_native_runtime = 0x20,
297
+ ompt_native_overhead = 0x40,
298
+ ompt_native_idleness = 0x80
299
+ } ompt_native_mon_flag_t;
300
+
301
+ typedef enum ompt_task_flag_t {
302
+ ompt_task_initial = 0x00000001,
303
+ ompt_task_implicit = 0x00000002,
304
+ ompt_task_explicit = 0x00000004,
305
+ ompt_task_target = 0x00000008,
306
+ ompt_task_undeferred = 0x08000000,
307
+ ompt_task_untied = 0x10000000,
308
+ ompt_task_final = 0x20000000,
309
+ ompt_task_mergeable = 0x40000000,
310
+ ompt_task_merged = 0x80000000
311
+ } ompt_task_flag_t;
312
+
313
+ typedef enum ompt_task_status_t {
314
+ ompt_task_complete = 1,
315
+ ompt_task_yield = 2,
316
+ ompt_task_cancel = 3,
317
+ ompt_task_detach = 4,
318
+ ompt_task_early_fulfill = 5,
319
+ ompt_task_late_fulfill = 6,
320
+ ompt_task_switch = 7
321
+ } ompt_task_status_t;
322
+
323
+ typedef enum ompt_target_t {
324
+ ompt_target = 1,
325
+ ompt_target_enter_data = 2,
326
+ ompt_target_exit_data = 3,
327
+ ompt_target_update = 4
328
+ } ompt_target_t;
329
+
330
+ typedef enum ompt_parallel_flag_t {
331
+ ompt_parallel_invoker_program = 0x00000001,
332
+ ompt_parallel_invoker_runtime = 0x00000002,
333
+ ompt_parallel_league = 0x40000000,
334
+ ompt_parallel_team = 0x80000000
335
+ } ompt_parallel_flag_t;
336
+
337
+ typedef enum ompt_target_map_flag_t {
338
+ ompt_target_map_flag_to = 0x01,
339
+ ompt_target_map_flag_from = 0x02,
340
+ ompt_target_map_flag_alloc = 0x04,
341
+ ompt_target_map_flag_release = 0x08,
342
+ ompt_target_map_flag_delete = 0x10,
343
+ ompt_target_map_flag_implicit = 0x20
344
+ } ompt_target_map_flag_t;
345
+
346
+ typedef enum ompt_dependence_type_t {
347
+ ompt_dependence_type_in = 1,
348
+ ompt_dependence_type_out = 2,
349
+ ompt_dependence_type_inout = 3,
350
+ ompt_dependence_type_mutexinoutset = 4,
351
+ ompt_dependence_type_source = 5,
352
+ ompt_dependence_type_sink = 6
353
+ } ompt_dependence_type_t;
354
+
355
+ typedef enum ompt_cancel_flag_t {
356
+ ompt_cancel_parallel = 0x01,
357
+ ompt_cancel_sections = 0x02,
358
+ ompt_cancel_loop = 0x04,
359
+ ompt_cancel_taskgroup = 0x08,
360
+ ompt_cancel_activated = 0x10,
361
+ ompt_cancel_detected = 0x20,
362
+ ompt_cancel_discarded_task = 0x40
363
+ } ompt_cancel_flag_t;
364
+
365
+ typedef uint64_t ompt_hwid_t;
366
+
367
+ typedef uint64_t ompt_wait_id_t;
368
+
369
+ typedef enum ompt_frame_flag_t {
370
+ ompt_frame_runtime = 0x00,
371
+ ompt_frame_application = 0x01,
372
+ ompt_frame_cfa = 0x10,
373
+ ompt_frame_framepointer = 0x20,
374
+ ompt_frame_stackaddress = 0x30
375
+ } ompt_frame_flag_t;
376
+
377
+ typedef enum ompt_state_t {
378
+ ompt_state_work_serial = 0x000,
379
+ ompt_state_work_parallel = 0x001,
380
+ ompt_state_work_reduction = 0x002,
381
+
382
+ ompt_state_wait_barrier = 0x010,
383
+ ompt_state_wait_barrier_implicit_parallel = 0x011,
384
+ ompt_state_wait_barrier_implicit_workshare = 0x012,
385
+ ompt_state_wait_barrier_implicit = 0x013,
386
+ ompt_state_wait_barrier_explicit = 0x014,
387
+
388
+ ompt_state_wait_taskwait = 0x020,
389
+ ompt_state_wait_taskgroup = 0x021,
390
+
391
+ ompt_state_wait_mutex = 0x040,
392
+ ompt_state_wait_lock = 0x041,
393
+ ompt_state_wait_critical = 0x042,
394
+ ompt_state_wait_atomic = 0x043,
395
+ ompt_state_wait_ordered = 0x044,
396
+
397
+ ompt_state_wait_target = 0x080,
398
+ ompt_state_wait_target_map = 0x081,
399
+ ompt_state_wait_target_update = 0x082,
400
+
401
+ ompt_state_idle = 0x100,
402
+ ompt_state_overhead = 0x101,
403
+ ompt_state_undefined = 0x102
404
+ } ompt_state_t;
405
+
406
+ typedef uint64_t (*ompt_get_unique_id_t) (void);
407
+
408
+ typedef uint64_t ompd_size_t;
409
+
410
+ typedef uint64_t ompd_wait_id_t;
411
+
412
+ typedef uint64_t ompd_addr_t;
413
+ typedef int64_t ompd_word_t;
414
+ typedef uint64_t ompd_seg_t;
415
+
416
+ typedef uint64_t ompd_device_t;
417
+
418
+ typedef uint64_t ompd_thread_id_t;
419
+
420
+ typedef enum ompd_scope_t {
421
+ ompd_scope_global = 1,
422
+ ompd_scope_address_space = 2,
423
+ ompd_scope_thread = 3,
424
+ ompd_scope_parallel = 4,
425
+ ompd_scope_implicit_task = 5,
426
+ ompd_scope_task = 6
427
+ } ompd_scope_t;
428
+
429
+ typedef uint64_t ompd_icv_id_t;
430
+
431
+ typedef enum ompd_rc_t {
432
+ ompd_rc_ok = 0,
433
+ ompd_rc_unavailable = 1,
434
+ ompd_rc_stale_handle = 2,
435
+ ompd_rc_bad_input = 3,
436
+ ompd_rc_error = 4,
437
+ ompd_rc_unsupported = 5,
438
+ ompd_rc_needs_state_tracking = 6,
439
+ ompd_rc_incompatible = 7,
440
+ ompd_rc_device_read_error = 8,
441
+ ompd_rc_device_write_error = 9,
442
+ ompd_rc_nomem = 10,
443
+ } ompd_rc_t;
444
+
445
+ typedef void (*ompt_interface_fn_t) (void);
446
+
447
+ typedef ompt_interface_fn_t (*ompt_function_lookup_t) (
448
+ const char *interface_function_name
449
+ );
450
+
451
+ typedef union ompt_data_t {
452
+ uint64_t value;
453
+ void *ptr;
454
+ } ompt_data_t;
455
+
456
+ typedef struct ompt_frame_t {
457
+ ompt_data_t exit_frame;
458
+ ompt_data_t enter_frame;
459
+ int exit_frame_flags;
460
+ int enter_frame_flags;
461
+ } ompt_frame_t;
462
+
463
+ typedef void (*ompt_callback_t) (void);
464
+
465
+ typedef void ompt_device_t;
466
+
467
+ typedef void ompt_buffer_t;
468
+
469
+ typedef void (*ompt_callback_buffer_request_t) (
470
+ int device_num,
471
+ ompt_buffer_t **buffer,
472
+ size_t *bytes
473
+ );
474
+
475
+ typedef void (*ompt_callback_buffer_complete_t) (
476
+ int device_num,
477
+ ompt_buffer_t *buffer,
478
+ size_t bytes,
479
+ ompt_buffer_cursor_t begin,
480
+ int buffer_owned
481
+ );
482
+
483
+ typedef void (*ompt_finalize_t) (
484
+ ompt_data_t *tool_data
485
+ );
486
+
487
+ typedef int (*ompt_initialize_t) (
488
+ ompt_function_lookup_t lookup,
489
+ int initial_device_num,
490
+ ompt_data_t *tool_data
491
+ );
492
+
493
+ typedef struct ompt_start_tool_result_t {
494
+ ompt_initialize_t initialize;
495
+ ompt_finalize_t finalize;
496
+ ompt_data_t tool_data;
497
+ } ompt_start_tool_result_t;
498
+
499
+ typedef struct ompt_record_abstract_t {
500
+ ompt_record_native_t rclass;
501
+ const char *type;
502
+ ompt_device_time_t start_time;
503
+ ompt_device_time_t end_time;
504
+ ompt_hwid_t hwid;
505
+ } ompt_record_abstract_t;
506
+
507
+ typedef struct ompt_dependence_t {
508
+ ompt_data_t variable;
509
+ ompt_dependence_type_t dependence_type;
510
+ } ompt_dependence_t;
511
+
512
+ typedef int (*ompt_enumerate_states_t) (
513
+ int current_state,
514
+ int *next_state,
515
+ const char **next_state_name
516
+ );
517
+
518
+ typedef int (*ompt_enumerate_mutex_impls_t) (
519
+ int current_impl,
520
+ int *next_impl,
521
+ const char **next_impl_name
522
+ );
523
+
524
+ typedef ompt_set_result_t (*ompt_set_callback_t) (
525
+ ompt_callbacks_t event,
526
+ ompt_callback_t callback
527
+ );
528
+
529
+ typedef int (*ompt_get_callback_t) (
530
+ ompt_callbacks_t event,
531
+ ompt_callback_t *callback
532
+ );
533
+
534
+ typedef ompt_data_t *(*ompt_get_thread_data_t) (void);
535
+
536
+ typedef int (*ompt_get_num_procs_t) (void);
537
+
538
+ typedef int (*ompt_get_num_places_t) (void);
539
+
540
+ typedef int (*ompt_get_place_proc_ids_t) (
541
+ int place_num,
542
+ int ids_size,
543
+ int *ids
544
+ );
545
+
546
+ typedef int (*ompt_get_place_num_t) (void);
547
+
548
+ typedef int (*ompt_get_partition_place_nums_t) (
549
+ int place_nums_size,
550
+ int *place_nums
551
+ );
552
+
553
+ typedef int (*ompt_get_proc_id_t) (void);
554
+
555
+ typedef int (*ompt_get_state_t) (
556
+ ompt_wait_id_t *wait_id
557
+ );
558
+
559
+ typedef int (*ompt_get_parallel_info_t) (
560
+ int ancestor_level,
561
+ ompt_data_t **parallel_data,
562
+ int *team_size
563
+ );
564
+
565
+ typedef int (*ompt_get_task_info_t) (
566
+ int ancestor_level,
567
+ int *flags,
568
+ ompt_data_t **task_data,
569
+ ompt_frame_t **task_frame,
570
+ ompt_data_t **parallel_data,
571
+ int *thread_num
572
+ );
573
+
574
+ typedef int (*ompt_get_task_memory_t)(
575
+ void **addr,
576
+ size_t *size,
577
+ int block
578
+ );
579
+
580
+ typedef int (*ompt_get_target_info_t) (
581
+ uint64_t *device_num,
582
+ ompt_id_t *target_id,
583
+ ompt_id_t *host_op_id
584
+ );
585
+
586
+ typedef int (*ompt_get_num_devices_t) (void);
587
+
588
+ typedef void (*ompt_finalize_tool_t) (void);
589
+
590
+ typedef int (*ompt_get_device_num_procs_t) (
591
+ ompt_device_t *device
592
+ );
593
+
594
+ typedef ompt_device_time_t (*ompt_get_device_time_t) (
595
+ ompt_device_t *device
596
+ );
597
+
598
+ typedef double (*ompt_translate_time_t) (
599
+ ompt_device_t *device,
600
+ ompt_device_time_t time
601
+ );
602
+
603
+ typedef ompt_set_result_t (*ompt_set_trace_ompt_t) (
604
+ ompt_device_t *device,
605
+ unsigned int enable,
606
+ unsigned int etype
607
+ );
608
+
609
+ typedef ompt_set_result_t (*ompt_set_trace_native_t) (
610
+ ompt_device_t *device,
611
+ int enable,
612
+ int flags
613
+ );
614
+
615
+ typedef int (*ompt_start_trace_t) (
616
+ ompt_device_t *device,
617
+ ompt_callback_buffer_request_t request,
618
+ ompt_callback_buffer_complete_t complete
619
+ );
620
+
621
+ typedef int (*ompt_pause_trace_t) (
622
+ ompt_device_t *device,
623
+ int begin_pause
624
+ );
625
+
626
+ typedef int (*ompt_flush_trace_t) (
627
+ ompt_device_t *device
628
+ );
629
+
630
+ typedef int (*ompt_stop_trace_t) (
631
+ ompt_device_t *device
632
+ );
633
+
634
+ typedef int (*ompt_advance_buffer_cursor_t) (
635
+ ompt_device_t *device,
636
+ ompt_buffer_t *buffer,
637
+ size_t size,
638
+ ompt_buffer_cursor_t current,
639
+ ompt_buffer_cursor_t *next
640
+ );
641
+
642
+ typedef ompt_record_t (*ompt_get_record_type_t) (
643
+ ompt_buffer_t *buffer,
644
+ ompt_buffer_cursor_t current
645
+ );
646
+
647
+ typedef void *(*ompt_get_record_native_t) (
648
+ ompt_buffer_t *buffer,
649
+ ompt_buffer_cursor_t current,
650
+ ompt_id_t *host_op_id
651
+ );
652
+
653
+ typedef ompt_record_abstract_t *
654
+ (*ompt_get_record_abstract_t) (
655
+ void *native_record
656
+ );
657
+
658
+ typedef void (*ompt_callback_thread_begin_t) (
659
+ ompt_thread_t thread_type,
660
+ ompt_data_t *thread_data
661
+ );
662
+
663
+ typedef struct ompt_record_thread_begin_t {
664
+ ompt_thread_t thread_type;
665
+ } ompt_record_thread_begin_t;
666
+
667
+ typedef void (*ompt_callback_thread_end_t) (
668
+ ompt_data_t *thread_data
669
+ );
670
+
671
+ typedef void (*ompt_callback_parallel_begin_t) (
672
+ ompt_data_t *encountering_task_data,
673
+ const ompt_frame_t *encountering_task_frame,
674
+ ompt_data_t *parallel_data,
675
+ unsigned int requested_parallelism,
676
+ int flags,
677
+ const void *codeptr_ra
678
+ );
679
+
680
+ typedef struct ompt_record_parallel_begin_t {
681
+ ompt_id_t encountering_task_id;
682
+ ompt_id_t parallel_id;
683
+ unsigned int requested_parallelism;
684
+ int flags;
685
+ const void *codeptr_ra;
686
+ } ompt_record_parallel_begin_t;
687
+
688
+ typedef void (*ompt_callback_parallel_end_t) (
689
+ ompt_data_t *parallel_data,
690
+ ompt_data_t *encountering_task_data,
691
+ int flags,
692
+ const void *codeptr_ra
693
+ );
694
+
695
+ typedef struct ompt_record_parallel_end_t {
696
+ ompt_id_t parallel_id;
697
+ ompt_id_t encountering_task_id;
698
+ int flags;
699
+ const void *codeptr_ra;
700
+ } ompt_record_parallel_end_t;
701
+
702
+ typedef void (*ompt_callback_work_t) (
703
+ ompt_work_t wstype,
704
+ ompt_scope_endpoint_t endpoint,
705
+ ompt_data_t *parallel_data,
706
+ ompt_data_t *task_data,
707
+ uint64_t count,
708
+ const void *codeptr_ra
709
+ );
710
+
711
+ typedef struct ompt_record_work_t {
712
+ ompt_work_t wstype;
713
+ ompt_scope_endpoint_t endpoint;
714
+ ompt_id_t parallel_id;
715
+ ompt_id_t task_id;
716
+ uint64_t count;
717
+ const void *codeptr_ra;
718
+ } ompt_record_work_t;
719
+
720
+ typedef void (*ompt_callback_dispatch_t) (
721
+ ompt_data_t *parallel_data,
722
+ ompt_data_t *task_data,
723
+ ompt_dispatch_t kind,
724
+ ompt_data_t instance
725
+ );
726
+
727
+ typedef struct ompt_record_dispatch_t {
728
+ ompt_id_t parallel_id;
729
+ ompt_id_t task_id;
730
+ ompt_dispatch_t kind;
731
+ ompt_data_t instance;
732
+ } ompt_record_dispatch_t;
733
+
734
+ typedef void (*ompt_callback_task_create_t) (
735
+ ompt_data_t *encountering_task_data,
736
+ const ompt_frame_t *encountering_task_frame,
737
+ ompt_data_t *new_task_data,
738
+ int flags,
739
+ int has_dependences,
740
+ const void *codeptr_ra
741
+ );
742
+
743
+ typedef struct ompt_record_task_create_t {
744
+ ompt_id_t encountering_task_id;
745
+ ompt_id_t new_task_id;
746
+ int flags;
747
+ int has_dependences;
748
+ const void *codeptr_ra;
749
+ } ompt_record_task_create_t;
750
+
751
+ typedef void (*ompt_callback_dependences_t) (
752
+ ompt_data_t *task_data,
753
+ const ompt_dependence_t *deps,
754
+ int ndeps
755
+ );
756
+
757
+ typedef struct ompt_record_dependences_t {
758
+ ompt_id_t task_id;
759
+ ompt_dependence_t dep;
760
+ int ndeps;
761
+ } ompt_record_dependences_t;
762
+
763
+ typedef void (*ompt_callback_task_dependence_t) (
764
+ ompt_data_t *src_task_data,
765
+ ompt_data_t *sink_task_data
766
+ );
767
+
768
+ typedef struct ompt_record_task_dependence_t {
769
+ ompt_id_t src_task_id;
770
+ ompt_id_t sink_task_id;
771
+ } ompt_record_task_dependence_t;
772
+
773
+ typedef void (*ompt_callback_task_schedule_t) (
774
+ ompt_data_t *prior_task_data,
775
+ ompt_task_status_t prior_task_status,
776
+ ompt_data_t *next_task_data
777
+ );
778
+
779
+ typedef struct ompt_record_task_schedule_t {
780
+ ompt_id_t prior_task_id;
781
+ ompt_task_status_t prior_task_status;
782
+ ompt_id_t next_task_id;
783
+ } ompt_record_task_schedule_t;
784
+
785
+ typedef void (*ompt_callback_implicit_task_t) (
786
+ ompt_scope_endpoint_t endpoint,
787
+ ompt_data_t *parallel_data,
788
+ ompt_data_t *task_data,
789
+ unsigned int actual_parallelism,
790
+ unsigned int index,
791
+ int flags
792
+ );
793
+
794
+ typedef struct ompt_record_implicit_task_t {
795
+ ompt_scope_endpoint_t endpoint;
796
+ ompt_id_t parallel_id;
797
+ ompt_id_t task_id;
798
+ unsigned int actual_parallelism;
799
+ unsigned int index;
800
+ int flags;
801
+ } ompt_record_implicit_task_t;
802
+
803
+ typedef void (*ompt_callback_master_t) (
804
+ ompt_scope_endpoint_t endpoint,
805
+ ompt_data_t *parallel_data,
806
+ ompt_data_t *task_data,
807
+ const void *codeptr_ra
808
+ );
809
+
810
+ typedef struct ompt_record_master_t {
811
+ ompt_scope_endpoint_t endpoint;
812
+ ompt_id_t parallel_id;
813
+ ompt_id_t task_id;
814
+ const void *codeptr_ra;
815
+ } ompt_record_master_t;
816
+
817
+ typedef void (*ompt_callback_sync_region_t) (
818
+ ompt_sync_region_t kind,
819
+ ompt_scope_endpoint_t endpoint,
820
+ ompt_data_t *parallel_data,
821
+ ompt_data_t *task_data,
822
+ const void *codeptr_ra
823
+ );
824
+
825
+ typedef struct ompt_record_sync_region_t {
826
+ ompt_sync_region_t kind;
827
+ ompt_scope_endpoint_t endpoint;
828
+ ompt_id_t parallel_id;
829
+ ompt_id_t task_id;
830
+ const void *codeptr_ra;
831
+ } ompt_record_sync_region_t;
832
+
833
+ typedef void (*ompt_callback_mutex_acquire_t) (
834
+ ompt_mutex_t kind,
835
+ unsigned int hint,
836
+ unsigned int impl,
837
+ ompt_wait_id_t wait_id,
838
+ const void *codeptr_ra
839
+ );
840
+
841
+ typedef struct ompt_record_mutex_acquire_t {
842
+ ompt_mutex_t kind;
843
+ unsigned int hint;
844
+ unsigned int impl;
845
+ ompt_wait_id_t wait_id;
846
+ const void *codeptr_ra;
847
+ } ompt_record_mutex_acquire_t;
848
+
849
+ typedef void (*ompt_callback_mutex_t) (
850
+ ompt_mutex_t kind,
851
+ ompt_wait_id_t wait_id,
852
+ const void *codeptr_ra
853
+ );
854
+
855
+ typedef struct ompt_record_mutex_t {
856
+ ompt_mutex_t kind;
857
+ ompt_wait_id_t wait_id;
858
+ const void *codeptr_ra;
859
+ } ompt_record_mutex_t;
860
+
861
+ typedef void (*ompt_callback_nest_lock_t) (
862
+ ompt_scope_endpoint_t endpoint,
863
+ ompt_wait_id_t wait_id,
864
+ const void *codeptr_ra
865
+ );
866
+
867
+ typedef struct ompt_record_nest_lock_t {
868
+ ompt_scope_endpoint_t endpoint;
869
+ ompt_wait_id_t wait_id;
870
+ const void *codeptr_ra;
871
+ } ompt_record_nest_lock_t;
872
+
873
+ typedef void (*ompt_callback_flush_t) (
874
+ ompt_data_t *thread_data,
875
+ const void *codeptr_ra
876
+ );
877
+
878
+ typedef struct ompt_record_flush_t {
879
+ const void *codeptr_ra;
880
+ } ompt_record_flush_t;
881
+
882
+ typedef void (*ompt_callback_cancel_t) (
883
+ ompt_data_t *task_data,
884
+ int flags,
885
+ const void *codeptr_ra
886
+ );
887
+
888
+ typedef struct ompt_record_cancel_t {
889
+ ompt_id_t task_id;
890
+ int flags;
891
+ const void *codeptr_ra;
892
+ } ompt_record_cancel_t;
893
+
894
+ typedef void (*ompt_callback_device_initialize_t) (
895
+ int device_num,
896
+ const char *type,
897
+ ompt_device_t *device,
898
+ ompt_function_lookup_t lookup,
899
+ const char *documentation
900
+ );
901
+
902
+ typedef void (*ompt_callback_device_finalize_t) (
903
+ int device_num
904
+ );
905
+
906
+ typedef void (*ompt_callback_device_load_t) (
907
+ int device_num,
908
+ const char *filename,
909
+ int64_t offset_in_file,
910
+ void *vma_in_file,
911
+ size_t bytes,
912
+ void *host_addr,
913
+ void *device_addr,
914
+ uint64_t module_id
915
+ );
916
+
917
+ typedef void (*ompt_callback_device_unload_t) (
918
+ int device_num,
919
+ uint64_t module_id
920
+ );
921
+
922
+ typedef void (*ompt_callback_target_data_op_t) (
923
+ ompt_id_t target_id,
924
+ ompt_id_t host_op_id,
925
+ ompt_target_data_op_t optype,
926
+ void *src_addr,
927
+ int src_device_num,
928
+ void *dest_addr,
929
+ int dest_device_num,
930
+ size_t bytes,
931
+ const void *codeptr_ra
932
+ );
933
+
934
+ typedef struct ompt_record_target_data_op_t {
935
+ ompt_id_t host_op_id;
936
+ ompt_target_data_op_t optype;
937
+ void *src_addr;
938
+ int src_device_num;
939
+ void *dest_addr;
940
+ int dest_device_num;
941
+ size_t bytes;
942
+ ompt_device_time_t end_time;
943
+ const void *codeptr_ra;
944
+ } ompt_record_target_data_op_t;
945
+
946
+ typedef void (*ompt_callback_target_t) (
947
+ ompt_target_t kind,
948
+ ompt_scope_endpoint_t endpoint,
949
+ int device_num,
950
+ ompt_data_t *task_data,
951
+ ompt_id_t target_id,
952
+ const void *codeptr_ra
953
+ );
954
+
955
+ typedef struct ompt_record_target_t {
956
+ ompt_target_t kind;
957
+ ompt_scope_endpoint_t endpoint;
958
+ int device_num;
959
+ ompt_id_t task_id;
960
+ ompt_id_t target_id;
961
+ const void *codeptr_ra;
962
+ } ompt_record_target_t;
963
+
964
+ typedef void (*ompt_callback_target_map_t) (
965
+ ompt_id_t target_id,
966
+ unsigned int nitems,
967
+ void **host_addr,
968
+ void **device_addr,
969
+ size_t *bytes,
970
+ unsigned int *mapping_flags,
971
+ const void *codeptr_ra
972
+ );
973
+
974
+ typedef struct ompt_record_target_map_t {
975
+ ompt_id_t target_id;
976
+ unsigned int nitems;
977
+ void **host_addr;
978
+ void **device_addr;
979
+ size_t *bytes;
980
+ unsigned int *mapping_flags;
981
+ const void *codeptr_ra;
982
+ } ompt_record_target_map_t;
983
+
984
+ typedef void (*ompt_callback_target_submit_t) (
985
+ ompt_id_t target_id,
986
+ ompt_id_t host_op_id,
987
+ unsigned int requested_num_teams
988
+ );
989
+
990
+ typedef struct ompt_record_target_kernel_t {
991
+ ompt_id_t host_op_id;
992
+ unsigned int requested_num_teams;
993
+ unsigned int granted_num_teams;
994
+ ompt_device_time_t end_time;
995
+ } ompt_record_target_kernel_t;
996
+
997
+ typedef int (*ompt_callback_control_tool_t) (
998
+ uint64_t command,
999
+ uint64_t modifier,
1000
+ void *arg,
1001
+ const void *codeptr_ra
1002
+ );
1003
+
1004
+ typedef struct ompt_record_control_tool_t {
1005
+ uint64_t command;
1006
+ uint64_t modifier;
1007
+ const void *codeptr_ra;
1008
+ } ompt_record_control_tool_t;
1009
+
1010
+ typedef struct ompd_address_t {
1011
+ ompd_seg_t segment;
1012
+ ompd_addr_t address;
1013
+ } ompd_address_t;
1014
+
1015
+ typedef struct ompd_frame_info_t {
1016
+ ompd_address_t frame_address;
1017
+ ompd_word_t frame_flag;
1018
+ } ompd_frame_info_t;
1019
+
1020
+ typedef struct _ompd_aspace_handle ompd_address_space_handle_t;
1021
+ typedef struct _ompd_thread_handle ompd_thread_handle_t;
1022
+ typedef struct _ompd_parallel_handle ompd_parallel_handle_t;
1023
+ typedef struct _ompd_task_handle ompd_task_handle_t;
1024
+
1025
+ typedef struct _ompd_aspace_cont ompd_address_space_context_t;
1026
+ typedef struct _ompd_thread_cont ompd_thread_context_t;
1027
+
1028
+ typedef struct ompd_device_type_sizes_t {
1029
+ uint8_t sizeof_char;
1030
+ uint8_t sizeof_short;
1031
+ uint8_t sizeof_int;
1032
+ uint8_t sizeof_long;
1033
+ uint8_t sizeof_long_long;
1034
+ uint8_t sizeof_pointer;
1035
+ } ompd_device_type_sizes_t;
1036
+
1037
+ typedef struct ompt_record_ompt_t {
1038
+ ompt_callbacks_t type;
1039
+ ompt_device_time_t time;
1040
+ ompt_id_t thread_id;
1041
+ ompt_id_t target_id;
1042
+ union {
1043
+ ompt_record_thread_begin_t thread_begin;
1044
+ ompt_record_parallel_begin_t parallel_begin;
1045
+ ompt_record_parallel_end_t parallel_end;
1046
+ ompt_record_work_t work;
1047
+ ompt_record_dispatch_t dispatch;
1048
+ ompt_record_task_create_t task_create;
1049
+ ompt_record_dependences_t dependences;
1050
+ ompt_record_task_dependence_t task_dependence;
1051
+ ompt_record_task_schedule_t task_schedule;
1052
+ ompt_record_implicit_task_t implicit_task;
1053
+ ompt_record_master_t master;
1054
+ ompt_record_sync_region_t sync_region;
1055
+ ompt_record_mutex_acquire_t mutex_acquire;
1056
+ ompt_record_mutex_t mutex;
1057
+ ompt_record_nest_lock_t nest_lock;
1058
+ ompt_record_flush_t flush;
1059
+ ompt_record_cancel_t cancel;
1060
+ ompt_record_target_t target;
1061
+ ompt_record_target_data_op_t target_data_op;
1062
+ ompt_record_target_map_t target_map;
1063
+ ompt_record_target_kernel_t target_kernel;
1064
+ ompt_record_control_tool_t control_tool;
1065
+ } record;
1066
+ } ompt_record_ompt_t;
1067
+
1068
+ typedef ompt_record_ompt_t *(*ompt_get_record_ompt_t) (
1069
+ ompt_buffer_t *buffer,
1070
+ ompt_buffer_cursor_t current
1071
+ );
1072
+
1073
+ #define ompt_id_none 0
1074
+ #define ompt_data_none {0}
1075
+ #define ompt_time_none 0
1076
+ #define ompt_hwid_none 0
1077
+ #define ompt_addr_none ~0
1078
+ #define ompt_mutex_impl_none 0
1079
+ #define ompt_wait_id_none 0
1080
+
1081
+ #define ompd_segment_none 0
1082
+
1083
+ #endif /* __OMPT__ */
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/channel_descriptor.h ADDED
@@ -0,0 +1,588 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CHANNEL_DESCRIPTOR_H__)
51
+ #define __CHANNEL_DESCRIPTOR_H__
52
+
53
+ #if defined(__cplusplus)
54
+
55
+ /*******************************************************************************
56
+ * *
57
+ * *
58
+ * *
59
+ *******************************************************************************/
60
+
61
+ #include "cuda_runtime_api.h"
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ /**
70
+ * \addtogroup CUDART_HIGHLEVEL
71
+ *
72
+ * @{
73
+ */
74
+
75
+ /**
76
+ * \brief \hl Returns a channel descriptor using the specified format
77
+ *
78
+ * Returns a channel descriptor with format \p f and number of bits of each
79
+ * component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
80
+ * defined as:
81
+ * \code
82
+ struct cudaChannelFormatDesc {
83
+ int x, y, z, w;
84
+ enum cudaChannelFormatKind f;
85
+ };
86
+ * \endcode
87
+ *
88
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
89
+ * ::cudaChannelFormatKindUnsigned, cudaChannelFormatKindFloat,
90
+ * ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2,
91
+ * ::cudaChannelFormatKindSignedNormalized8X4,
92
+ * ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2,
93
+ * ::cudaChannelFormatKindUnsignedNormalized8X4,
94
+ * ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2,
95
+ * ::cudaChannelFormatKindSignedNormalized16X4,
96
+ * ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2,
97
+ * ::cudaChannelFormatKindUnsignedNormalized16X4
98
+ * or ::cudaChannelFormatKindNV12.
99
+ *
100
+ * The format is specified by the template specialization.
101
+ *
102
+ * The template function specializes for the following scalar types:
103
+ * char, signed char, unsigned char, short, unsigned short, int, unsigned int, long, unsigned long, and float.
104
+ * The template function specializes for the following vector types:
105
+ * char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, long{1|2|4}, ulong{1|2|4}, float{1|2|4}.
106
+ * The template function specializes for following cudaChannelFormatKind enum values:
107
+ * ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4}, and ::cudaChannelFormatKindNV12.
108
+ *
109
+ * Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone
110
+ *
111
+ * \return
112
+ * Channel descriptor with format \p f
113
+ *
114
+ * \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
115
+ * ::cudaGetChannelDesc,
116
+ */
117
+ template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
118
+ {
119
+ return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
120
+ }
121
+
122
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
123
+ {
124
+ int e = (int)sizeof(unsigned short) * 8;
125
+
126
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
127
+ }
128
+
129
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
130
+ {
131
+ int e = (int)sizeof(unsigned short) * 8;
132
+
133
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
134
+ }
135
+
136
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
137
+ {
138
+ int e = (int)sizeof(unsigned short) * 8;
139
+
140
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
141
+ }
142
+
143
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
144
+ {
145
+ int e = (int)sizeof(unsigned short) * 8;
146
+
147
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
148
+ }
149
+
150
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
151
+ {
152
+ int e = (int)sizeof(char) * 8;
153
+
154
+ #if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
155
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
156
+ #else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
157
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
158
+ #endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
159
+ }
160
+
161
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
162
+ {
163
+ int e = (int)sizeof(signed char) * 8;
164
+
165
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
166
+ }
167
+
168
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
169
+ {
170
+ int e = (int)sizeof(unsigned char) * 8;
171
+
172
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
173
+ }
174
+
175
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
176
+ {
177
+ int e = (int)sizeof(signed char) * 8;
178
+
179
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
180
+ }
181
+
182
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
183
+ {
184
+ int e = (int)sizeof(unsigned char) * 8;
185
+
186
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
187
+ }
188
+
189
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
190
+ {
191
+ int e = (int)sizeof(signed char) * 8;
192
+
193
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
194
+ }
195
+
196
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
197
+ {
198
+ int e = (int)sizeof(unsigned char) * 8;
199
+
200
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
201
+ }
202
+
203
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
204
+ {
205
+ int e = (int)sizeof(signed char) * 8;
206
+
207
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
208
+ }
209
+
210
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
211
+ {
212
+ int e = (int)sizeof(unsigned char) * 8;
213
+
214
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
215
+ }
216
+
217
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
218
+ {
219
+ int e = (int)sizeof(short) * 8;
220
+
221
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
222
+ }
223
+
224
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
225
+ {
226
+ int e = (int)sizeof(unsigned short) * 8;
227
+
228
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
229
+ }
230
+
231
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
232
+ {
233
+ int e = (int)sizeof(short) * 8;
234
+
235
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
236
+ }
237
+
238
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
239
+ {
240
+ int e = (int)sizeof(unsigned short) * 8;
241
+
242
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
243
+ }
244
+
245
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
246
+ {
247
+ int e = (int)sizeof(short) * 8;
248
+
249
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
250
+ }
251
+
252
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
253
+ {
254
+ int e = (int)sizeof(unsigned short) * 8;
255
+
256
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
257
+ }
258
+
259
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
260
+ {
261
+ int e = (int)sizeof(short) * 8;
262
+
263
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
264
+ }
265
+
266
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
267
+ {
268
+ int e = (int)sizeof(unsigned short) * 8;
269
+
270
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
271
+ }
272
+
273
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
274
+ {
275
+ int e = (int)sizeof(int) * 8;
276
+
277
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
278
+ }
279
+
280
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
281
+ {
282
+ int e = (int)sizeof(unsigned int) * 8;
283
+
284
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
285
+ }
286
+
287
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
288
+ {
289
+ int e = (int)sizeof(int) * 8;
290
+
291
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
292
+ }
293
+
294
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
295
+ {
296
+ int e = (int)sizeof(unsigned int) * 8;
297
+
298
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
299
+ }
300
+
301
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
302
+ {
303
+ int e = (int)sizeof(int) * 8;
304
+
305
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
306
+ }
307
+
308
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
309
+ {
310
+ int e = (int)sizeof(unsigned int) * 8;
311
+
312
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
313
+ }
314
+
315
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
316
+ {
317
+ int e = (int)sizeof(int) * 8;
318
+
319
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
320
+ }
321
+
322
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
323
+ {
324
+ int e = (int)sizeof(unsigned int) * 8;
325
+
326
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
327
+ }
328
+
329
+ #if !defined(__LP64__)
330
+
331
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
332
+ {
333
+ int e = (int)sizeof(long) * 8;
334
+
335
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
336
+ }
337
+
338
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
339
+ {
340
+ int e = (int)sizeof(unsigned long) * 8;
341
+
342
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
343
+ }
344
+
345
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
346
+ {
347
+ int e = (int)sizeof(long) * 8;
348
+
349
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
350
+ }
351
+
352
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
353
+ {
354
+ int e = (int)sizeof(unsigned long) * 8;
355
+
356
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
357
+ }
358
+
359
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
360
+ {
361
+ int e = (int)sizeof(long) * 8;
362
+
363
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
364
+ }
365
+
366
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
367
+ {
368
+ int e = (int)sizeof(unsigned long) * 8;
369
+
370
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
371
+ }
372
+
373
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
374
+ {
375
+ int e = (int)sizeof(long) * 8;
376
+
377
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
378
+ }
379
+
380
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
381
+ {
382
+ int e = (int)sizeof(unsigned long) * 8;
383
+
384
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
385
+ }
386
+
387
+ #endif /* !__LP64__ */
388
+
389
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
390
+ {
391
+ int e = (int)sizeof(float) * 8;
392
+
393
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
394
+ }
395
+
396
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
397
+ {
398
+ int e = (int)sizeof(float) * 8;
399
+
400
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
401
+ }
402
+
403
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
404
+ {
405
+ int e = (int)sizeof(float) * 8;
406
+
407
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
408
+ }
409
+
410
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
411
+ {
412
+ int e = (int)sizeof(float) * 8;
413
+
414
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
415
+ }
416
+
417
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void)
418
+ {
419
+ int e = (int)sizeof(char) * 8;
420
+
421
+ return cudaCreateChannelDesc(e, e, e, 0, cudaChannelFormatKindNV12);
422
+ }
423
+
424
+ template<cudaChannelFormatKind> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
425
+ {
426
+ return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
427
+ }
428
+
429
+ /* Signed 8-bit normalized integer formats */
430
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X1>(void)
431
+ {
432
+ return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1);
433
+ }
434
+
435
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X2>(void)
436
+ {
437
+ return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2);
438
+ }
439
+
440
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X4>(void)
441
+ {
442
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4);
443
+ }
444
+
445
+ /* Unsigned 8-bit normalized integer formats */
446
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X1>(void)
447
+ {
448
+ return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1);
449
+ }
450
+
451
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X2>(void)
452
+ {
453
+ return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2);
454
+ }
455
+
456
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X4>(void)
457
+ {
458
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4);
459
+ }
460
+
461
+ /* Signed 16-bit normalized integer formats */
462
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X1>(void)
463
+ {
464
+ return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1);
465
+ }
466
+
467
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X2>(void)
468
+ {
469
+ return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2);
470
+ }
471
+
472
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X4>(void)
473
+ {
474
+ return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4);
475
+ }
476
+
477
+ /* Unsigned 16-bit normalized integer formats */
478
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X1>(void)
479
+ {
480
+ return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1);
481
+ }
482
+
483
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X2>(void)
484
+ {
485
+ return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2);
486
+ }
487
+
488
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X4>(void)
489
+ {
490
+ return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4);
491
+ }
492
+
493
+ /* NV12 format */
494
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindNV12>(void)
495
+ {
496
+ return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12);
497
+ }
498
+
499
+ /* BC1 format */
500
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1>(void)
501
+ {
502
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1);
503
+ }
504
+
505
+ /* BC1sRGB format */
506
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1SRGB>(void)
507
+ {
508
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB);
509
+ }
510
+
511
+ /* BC2 format */
512
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2>(void)
513
+ {
514
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2);
515
+ }
516
+
517
+ /* BC2sRGB format */
518
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2SRGB>(void)
519
+ {
520
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB);
521
+ }
522
+
523
+ /* BC3 format */
524
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3>(void)
525
+ {
526
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3);
527
+ }
528
+
529
+ /* BC3sRGB format */
530
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3SRGB>(void)
531
+ {
532
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB);
533
+ }
534
+
535
+ /* BC4 unsigned format */
536
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>(void)
537
+ {
538
+ return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4);
539
+ }
540
+
541
+ /* BC4 signed format */
542
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed4>(void)
543
+ {
544
+ return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4);
545
+ }
546
+
547
+ /* BC5 unsigned format */
548
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed5>(void)
549
+ {
550
+ return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5);
551
+ }
552
+
553
+ /* BC5 signed format */
554
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed5>(void)
555
+ {
556
+ return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5);
557
+ }
558
+
559
+ /* BC6H unsigned format */
560
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed6H>(void)
561
+ {
562
+ return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H);
563
+ }
564
+
565
+ /* BC6H signed format */
566
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed6H>(void)
567
+ {
568
+ return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H);
569
+ }
570
+
571
+ /* BC7 format */
572
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>(void)
573
+ {
574
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7);
575
+ }
576
+
577
+ /* BC7sRGB format */
578
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7SRGB>(void)
579
+ {
580
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB);
581
+ }
582
+
583
+ #endif /* __cplusplus */
584
+
585
+ /** @} */
586
+ /** @} */ /* END CUDART_TEXTURE_HL */
587
+
588
+ #endif /* !__CHANNEL_DESCRIPTOR_H__ */
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups.h ADDED
@@ -0,0 +1,1730 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _COOPERATIVE_GROUPS_H_
51
+ #define _COOPERATIVE_GROUPS_H_
52
+
53
+ #if defined(__cplusplus) && defined(__CUDACC__)
54
+
55
+ #include "cooperative_groups/details/info.h"
56
+ #include "cooperative_groups/details/driver_abi.h"
57
+ #include "cooperative_groups/details/helpers.h"
58
+ #include "cooperative_groups/details/memory.h"
59
+
60
+ #if defined(_CG_HAS_STL_ATOMICS)
61
+ #include <cuda/atomic>
62
+ #define _CG_THREAD_SCOPE(scope) _CG_STATIC_CONST_DECL cuda::thread_scope thread_scope = scope;
63
+ #else
64
+ #define _CG_THREAD_SCOPE(scope)
65
+ #endif
66
+
67
+ _CG_BEGIN_NAMESPACE
68
+
69
namespace details {
    // Type tags recorded in thread_group's 7-bit `type` bitfield; they identify
    // which member of thread_group's storage union is active for an instance.
    _CG_CONST_DECL unsigned int coalesced_group_id = 1;
    _CG_CONST_DECL unsigned int multi_grid_group_id = 2;
    _CG_CONST_DECL unsigned int grid_group_id = 3;
    _CG_CONST_DECL unsigned int thread_block_id = 4;
    _CG_CONST_DECL unsigned int multi_tile_group_id = 5;
    _CG_CONST_DECL unsigned int cluster_group_id = 6;
}
77
+
78
/**
 * class thread_group;
 *
 * Generic thread group type, into which all groups are convertible.
 * It acts as a container for all storage necessary for the derived groups,
 * and will dispatch the API calls to the correct derived group. This means
 * that all derived groups must implement the same interface as thread_group.
 */
class thread_group
{
protected:
    // Common header: only the 7-bit type tag is meaningful for all groups.
    struct group_data {
        unsigned int _unused : 1;
        unsigned int type : 7, : 0;  // zero-width field closes the allocation unit
    };

    // grid_group storage: pointer to the driver-provided grid workspace.
    struct gg_data {
        details::grid_workspace *gridWs;
    };

#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
    // multi_grid_group storage (experimental ABI): the opaque driver handle is
    // packed into 56 bits alongside the type tag, plus a function table pointer.
    struct mg_data {
        unsigned long long _unused : 1;
        unsigned long long type : 7;
        unsigned long long handle : 56;
        const details::multi_grid::multi_grid_functions *functions;
    };
#endif

    // coalesced_group / tile storage. Layout mirrors group_data so the type
    // tag occupies the same bits regardless of which union member is active.
    struct tg_data {
        unsigned int is_tiled : 1;
        unsigned int type : 7;
        unsigned int size : 24;
        // packed to 4b
        unsigned int metaGroupSize : 16;
        unsigned int metaGroupRank : 16;
        // packed to 8b
        unsigned int mask;
        // packed to 12b
        unsigned int _res;
    };

    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend class thread_block;

    // Single storage slot shared by all group kinds; `group.type` selects the
    // active member.
    union __align__(8) {
        group_data group;
        tg_data coalesced;
        gg_data grid;
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
        mg_data multi_grid;
#endif
    } _data;

    _CG_QUALIFIER thread_group operator=(const thread_group& src);

    // Only derived groups (and friends) may construct; stamps the type tag.
    _CG_QUALIFIER thread_group(unsigned int type) {
        _data.group.type = type;
        _data.group._unused = false;
    }

#ifdef _CG_CPP11_FEATURES
    // Guard the ABI: every union member must fit the 16-byte storage budget.
    static_assert(sizeof(tg_data) <= 16, "Failed size check");
    static_assert(sizeof(gg_data) <= 16, "Failed size check");
# ifdef _CG_ABI_EXPERIMENTAL
    static_assert(sizeof(mg_data) <= 16, "Failed size check");
# endif
#endif

public:
    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)

    // Dispatched to the derived group identified by get_type(); definitions
    // live elsewhere in this header.
    _CG_QUALIFIER unsigned long long size() const;
    _CG_QUALIFIER unsigned long long num_threads() const;
    _CG_QUALIFIER unsigned long long thread_rank() const;
    _CG_QUALIFIER void sync() const;
    _CG_QUALIFIER unsigned int get_type() const {
        return _data.group.type;
    }

};
159
+
160
// CRTP-style base: tags the derived group with its compile-time type id and
// forwards it to the thread_group constructor.
template <unsigned int TyId>
struct thread_group_base : public thread_group {
    _CG_QUALIFIER thread_group_base() : thread_group(TyId) {}
    _CG_STATIC_CONST_DECL unsigned int id = TyId;
};
165
+
166
#if defined(_CG_HAS_MULTI_GRID_GROUP)

/**
 * class multi_grid_group;
 *
 * Threads within this this group are guaranteed to be co-resident on the
 * same system, on multiple devices within the same launched kernels.
 * To use this group, the kernel must have been launched with
 * cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent),
 * and the device must support it (queryable device attribute).
 *
 * Constructed via this_multi_grid();
 */


# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
class multi_grid_group;

// Multi grid group requires these functions to be templated to prevent ptxas from trying to use CG syscalls
template <typename = void>
__device__ _CG_DEPRECATED multi_grid_group this_multi_grid();

class multi_grid_group : public thread_group_base<details::multi_grid_group_id>
{
private:
    // Loads the driver function table and caches the opaque intrinsic handle.
    template <typename = void>
    _CG_QUALIFIER multi_grid_group() {
        _data.multi_grid.functions = details::multi_grid::load_grid_intrinsics();
        _data.multi_grid.handle = _data.multi_grid.functions->get_intrinsic_handle();
    }

    friend multi_grid_group this_multi_grid<void>();

public:
    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)

    // A zero handle indicates the kernel was not launched as a multi-device
    // cooperative kernel.
    _CG_QUALIFIER bool is_valid() const {
        return (_data.multi_grid.handle != 0);
    }

    // Barrier across all threads of all participating grids; aborts on an
    // invalid handle.
    _CG_QUALIFIER void sync() const {
        if (!is_valid()) {
            _CG_ABORT();
        }
        _data.multi_grid.functions->sync(_data.multi_grid.handle);
    }

    _CG_QUALIFIER unsigned long long num_threads() const {
        _CG_ASSERT(is_valid());
        return _data.multi_grid.functions->size(_data.multi_grid.handle);
    }

    // Legacy alias for num_threads().
    _CG_QUALIFIER unsigned long long size() const {
        return num_threads();
    }

    _CG_QUALIFIER unsigned long long thread_rank() const {
        _CG_ASSERT(is_valid());
        return _data.multi_grid.functions->thread_rank(_data.multi_grid.handle);
    }

    _CG_QUALIFIER unsigned int grid_rank() const {
        _CG_ASSERT(is_valid());
        return (_data.multi_grid.functions->grid_rank(_data.multi_grid.handle));
    }

    _CG_QUALIFIER unsigned int num_grids() const {
        _CG_ASSERT(is_valid());
        return (_data.multi_grid.functions->num_grids(_data.multi_grid.handle));
    }
};
# else
// Non-experimental ABI: size/rank are sampled once at construction and cached
// in plain members instead of going through a driver function table.
class multi_grid_group
{
private:
    unsigned long long _handle;
    unsigned int _size;
    unsigned int _rank;

    friend _CG_QUALIFIER multi_grid_group this_multi_grid();

    _CG_QUALIFIER multi_grid_group() {
        _handle = details::multi_grid::get_intrinsic_handle();
        _size = details::multi_grid::size(_handle);
        _rank = details::multi_grid::thread_rank(_handle);
    }

public:
    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)

    _CG_QUALIFIER _CG_DEPRECATED bool is_valid() const {
        return (_handle != 0);
    }

    _CG_QUALIFIER _CG_DEPRECATED void sync() const {
        if (!is_valid()) {
            _CG_ABORT();
        }
        details::multi_grid::sync(_handle);
    }

    _CG_QUALIFIER _CG_DEPRECATED unsigned long long num_threads() const {
        _CG_ASSERT(is_valid());
        return _size;
    }

    // Legacy alias for num_threads().
    _CG_QUALIFIER _CG_DEPRECATED unsigned long long size() const {
        return num_threads();
    }

    _CG_QUALIFIER _CG_DEPRECATED unsigned long long thread_rank() const {
        _CG_ASSERT(is_valid());
        return _rank;
    }

    _CG_QUALIFIER _CG_DEPRECATED unsigned int grid_rank() const {
        _CG_ASSERT(is_valid());
        return (details::multi_grid::grid_rank(_handle));
    }

    _CG_QUALIFIER _CG_DEPRECATED unsigned int num_grids() const {
        _CG_ASSERT(is_valid());
        return (details::multi_grid::num_grids(_handle));
    }
};
# endif

/**
 * multi_grid_group this_multi_grid()
 *
 * Constructs a multi_grid_group
 */
# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
template <typename>
__device__
#else
_CG_QUALIFIER
# endif
_CG_DEPRECATED
multi_grid_group this_multi_grid()
{
    return multi_grid_group();
}
#endif
310
+
311
/**
 * class grid_group;
 *
 * Threads within this this group are guaranteed to be co-resident on the
 * same device within the same launched kernel. To use this group, the kernel
 * must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent),
 * and the device must support it (queryable device attribute).
 *
 * Constructed via this_grid();
 */
class grid_group : public thread_group_base<details::grid_group_id>
{
    _CG_STATIC_CONST_DECL unsigned int _group_id = details::grid_group_id;
    friend _CG_QUALIFIER grid_group this_grid();

private:
    // Constructed only by this_grid(), which supplies the driver workspace.
    _CG_QUALIFIER grid_group(details::grid_workspace *gridWs) {
        _data.grid.gridWs = gridWs;
    }

public:
    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)

    // A null workspace means the kernel was not cooperatively launched.
    _CG_QUALIFIER bool is_valid() const {
        return (_data.grid.gridWs != NULL);
    }

    // Grid-wide barrier over the workspace; aborts if the group is invalid.
    _CG_QUALIFIER void sync() const {
        if (!is_valid()) {
            _CG_ABORT();
        }
        details::grid::sync(&_data.grid.gridWs->barrier);
    }

#if defined(_CG_CPP11_FEATURES)
    using arrival_token = unsigned int;

    // Split-phase barrier: arrive now, wait later with the returned token.
    _CG_QUALIFIER arrival_token barrier_arrive() const {
        if (!is_valid()) {
            _CG_ABORT();
        }
        return details::grid::barrier_arrive(&_data.grid.gridWs->barrier);
    }

    _CG_QUALIFIER void barrier_wait(arrival_token&& token) const {
        details::grid::barrier_wait(token, &_data.grid.gridWs->barrier);
    }
#endif

    // Legacy alias for num_threads().
    _CG_STATIC_QUALIFIER unsigned long long size() {
        return details::grid::size();
    }

    _CG_STATIC_QUALIFIER dim3 group_dim() {
        return details::grid::grid_dim();
    }

    _CG_STATIC_QUALIFIER dim3 dim_threads() {
        return details::grid::dim_threads();
    }

    _CG_STATIC_QUALIFIER unsigned long long num_threads() {
        return details::grid::num_threads();
    }

    _CG_STATIC_QUALIFIER dim3 thread_index() {
        return details::grid::thread_index();
    }

    _CG_STATIC_QUALIFIER unsigned long long thread_rank() {
        return details::grid::thread_rank();
    }

    _CG_STATIC_QUALIFIER dim3 dim_blocks() {
        return details::grid::dim_blocks();
    }

    _CG_STATIC_QUALIFIER unsigned long long num_blocks() {
        return details::grid::num_blocks();
    }

    _CG_STATIC_QUALIFIER dim3 block_index() {
        return details::grid::block_index();
    }

    _CG_STATIC_QUALIFIER unsigned long long block_rank() {
        return details::grid::block_rank();
    }

# if defined(_CG_HAS_CLUSTER_GROUP)
    // Cluster-shape queries, available only on architectures with thread
    // block clusters.
    _CG_STATIC_QUALIFIER dim3 dim_clusters() {
        return details::grid::dim_clusters();
    }

    _CG_STATIC_QUALIFIER unsigned long long num_clusters() {
        return details::grid::num_clusters();
    }

    _CG_STATIC_QUALIFIER dim3 cluster_index() {
        return details::grid::cluster_index();
    }

    _CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
        return details::grid::cluster_rank();
    }
# endif
};
418
+
419
/**
 * grid_group this_grid()
 *
 * Constructs a grid_group from the workspace published by the driver for a
 * cooperative launch.
 */
_CG_QUALIFIER grid_group this_grid()
{
    // Fetch the per-grid workspace from the driver.
    grid_group result(details::get_grid_workspace());
#ifdef _CG_DEBUG
    // Debug builds immediately barrier to verify that *all* threads of the
    // grid are able to synchronize.
    result.sync();
#endif // _CG_DEBUG
    return result;
}
428
+
429
#if defined(_CG_HAS_CLUSTER_GROUP)
/**
 * class cluster_group
 *
 * Every GPU kernel is executed by a grid of thread blocks. A grid can be evenly
 * divided along all dimensions to form groups of blocks, each group of which is
 * a block cluster. Clustered grids are subject to various restrictions and
 * limitations. Primarily, a cluster consists of at most 8 blocks by default
 * (although the user is allowed to opt-in to non-standard sizes,) and clustered
 * grids are subject to additional occupancy limitations due to per-cluster
 * hardware resource consumption. In exchange, a block cluster is guaranteed to
 * be a cooperative group, with access to all cooperative group capabilities, as
 * well as cluster specific capabilities and accelerations. A cluster_group
 * represents a block cluster.
 *
 * Constructed via this_cluster_group();
 */
class cluster_group : public thread_group_base<details::cluster_group_id>
{
    // Friends
    friend _CG_QUALIFIER cluster_group this_cluster();

    // Disable constructor
    _CG_QUALIFIER cluster_group()
    {
    }

public:
    //_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_cluster)

    // The cluster barrier is stateless, so the token carries no data.
    using arrival_token = struct {};

    // Functionality exposed by the group; all members are thin wrappers over
    // the cluster intrinsics in details::cluster.
    _CG_STATIC_QUALIFIER void sync()
    {
        return details::cluster::sync();
    }

    // Split-phase barrier: arrive now, wait later (token is purely nominal).
    _CG_STATIC_QUALIFIER arrival_token barrier_arrive()
    {
        details::cluster::barrier_arrive();
        return arrival_token();
    }

    _CG_STATIC_QUALIFIER void barrier_wait()
    {
        return details::cluster::barrier_wait();
    }

    _CG_STATIC_QUALIFIER void barrier_wait(arrival_token&&)
    {
        return details::cluster::barrier_wait();
    }

    // Returns the rank of the block whose shared-memory window contains addr.
    _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
    {
        return details::cluster::query_shared_rank(addr);
    }

    // Maps a local shared-memory address into the address space of the block
    // with the given rank within the cluster.
    template <typename T>
    _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
    {
        return details::cluster::map_shared_rank(addr, rank);
    }

    _CG_STATIC_QUALIFIER dim3 block_index()
    {
        return details::cluster::block_index();
    }

    _CG_STATIC_QUALIFIER unsigned int block_rank()
    {
        return details::cluster::block_rank();
    }

    _CG_STATIC_QUALIFIER dim3 thread_index()
    {
        return details::cluster::thread_index();
    }

    _CG_STATIC_QUALIFIER unsigned int thread_rank()
    {
        return details::cluster::thread_rank();
    }

    _CG_STATIC_QUALIFIER dim3 dim_blocks()
    {
        return details::cluster::dim_blocks();
    }

    _CG_STATIC_QUALIFIER unsigned int num_blocks()
    {
        return details::cluster::num_blocks();
    }

    _CG_STATIC_QUALIFIER dim3 dim_threads()
    {
        return details::cluster::dim_threads();
    }

    _CG_STATIC_QUALIFIER unsigned int num_threads()
    {
        return details::cluster::num_threads();
    }

    // Legacy aliases
    _CG_STATIC_QUALIFIER unsigned int size()
    {
        return num_threads();
    }
};
540
+
541
/*
 * cluster_group this_cluster()
 *
 * Constructs a cluster_group representing the calling block's cluster.
 */
_CG_QUALIFIER cluster_group this_cluster()
{
    cluster_group group;
#ifdef _CG_DEBUG
    // Debug builds barrier immediately so a malformed launch is caught early.
    group.sync();
#endif
    return group;
}
#endif
555
+
556
#if defined(_CG_CPP11_FEATURES)
class thread_block;
template <unsigned int MaxBlockSize>
_CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
#endif

/**
 * class thread_block
 *
 * Every GPU kernel is executed by a grid of thread blocks, and threads within
 * each block are guaranteed to reside on the same streaming multiprocessor.
 * A thread_block represents a thread block whose dimensions are not known until runtime.
 *
 * Constructed via this_thread_block();
 */
class thread_block : public thread_group_base<details::thread_block_id>
{
    // Friends
    friend _CG_QUALIFIER thread_block this_thread_block();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);

#if defined(_CG_CPP11_FEATURES)
    template <unsigned int MaxBlockSize>
    friend _CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
    template <unsigned int Size>
    friend class __static_size_multi_warp_tile_base;

    // Scratch space used by multi-warp tiles partitioned from this block;
    // null when the block was constructed without user-provided scratch.
    details::multi_warp_scratch* const tile_memory;

    // Constructs a thread_block backed by user-provided tile scratch memory.
    // Debug builds abort if the actual block size exceeds MaxBlockSize.
    template <unsigned int MaxBlockSize>
    _CG_QUALIFIER thread_block(block_tile_memory<MaxBlockSize>& scratch) :
        tile_memory(details::get_scratch_ptr(&scratch)) {
#ifdef _CG_DEBUG
        if (num_threads() > MaxBlockSize) {
            details::abort();
        }
#endif
#if !defined(_CG_HAS_RESERVED_SHARED)
        // Without reserved shared memory, barriers live in the scratch buffer
        // and must be initialized (and published via sync) before first use.
        tile_memory->init_barriers(thread_rank());
        sync();
#endif
    }
#endif

    // Disable constructor
    _CG_QUALIFIER thread_block()
#if defined(_CG_CPP11_FEATURES)
        : tile_memory(details::get_scratch_ptr(NULL))
#endif
    { }

    // Internal Use
    // Builds a warp-level tile of `tilesz` threads (power of two, <= 32)
    // containing the calling thread, returned as a coalesced-style group.
    _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const {
        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);

        // Invalid, immediately fail
        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
            details::abort();
            return (thread_block());
        }

        unsigned int mask;
        // Rank of the first thread of this thread's tile.
        unsigned int base_offset = thread_rank() & (~(tilesz - 1));
        // The trailing tile may be truncated by the block size.
        unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);

        mask = (unsigned int)(-1) >> (32 - masklength);
        // Shift the contiguous lane mask to this tile's position in the warp.
        mask <<= (details::laneid() & ~(tilesz - 1));
        thread_group tile = thread_group(details::coalesced_group_id);
        tile._data.coalesced.mask = mask;
        tile._data.coalesced.size = __popc(mask);
        tile._data.coalesced.metaGroupSize = (details::cta::size() + tilesz - 1) / tilesz;
        tile._data.coalesced.metaGroupRank = details::cta::thread_rank() / tilesz;
        tile._data.coalesced.is_tiled = true;
        return (tile);
    }

public:
    _CG_STATIC_CONST_DECL unsigned int _group_id = details::thread_block_id;
    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)

    _CG_STATIC_QUALIFIER void sync() {
        details::cta::sync();
    }

#if defined(_CG_CPP11_FEATURES)
    // The block barrier is not split-phase in hardware, so arrive is a no-op
    // and wait performs the full barrier.
    struct arrival_token {};

    _CG_QUALIFIER arrival_token barrier_arrive() const {
        return arrival_token();
    }

    _CG_QUALIFIER void barrier_wait(arrival_token&&) const {
        details::cta::sync();
    }
#endif

    _CG_STATIC_QUALIFIER unsigned int size() {
        return details::cta::size();
    }

    _CG_STATIC_QUALIFIER unsigned int thread_rank() {
        return details::cta::thread_rank();
    }

    // Additional functionality exposed by the group
    _CG_STATIC_QUALIFIER dim3 group_index() {
        return details::cta::group_index();
    }

    _CG_STATIC_QUALIFIER dim3 thread_index() {
        return details::cta::thread_index();
    }

    _CG_STATIC_QUALIFIER dim3 group_dim() {
        return details::cta::block_dim();
    }

    _CG_STATIC_QUALIFIER dim3 dim_threads() {
        return details::cta::dim_threads();
    }

    _CG_STATIC_QUALIFIER unsigned int num_threads() {
        return details::cta::num_threads();
    }

};
683
+
684
/**
 * thread_block this_thread_block()
 *
 * Constructs a thread_block group for the calling thread's block.
 */
_CG_QUALIFIER thread_block this_thread_block()
{
    return thread_block();
}

#if defined(_CG_CPP11_FEATURES)
/**
 * Overload taking user-provided scratch memory for multi-warp tiling.
 */
template <unsigned int MaxBlockSize>
_CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch)
{
    return thread_block(scratch);
}
#endif
700
+
701
/**
 * class coalesced_group
 *
 * A group representing the current set of converged threads in a warp.
 * The size of the group is not guaranteed and it may return a group of
 * only one thread (itself).
 *
 * This group exposes warp-synchronous builtins.
 * Constructed via coalesced_threads();
 */
class coalesced_group : public thread_group_base<details::coalesced_group_id>
{
private:
    friend _CG_QUALIFIER coalesced_group coalesced_threads();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz);
    friend class details::_coalesced_group_data_access;

    // Re-indexes a physical-lane mask (as returned by ballot/match builtins)
    // into a mask keyed by this group's member ranks: bit r of the result is
    // set iff the group's r-th member had its lane bit set in laneMask.
    _CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const {
        unsigned int member_pack = 0;
        unsigned int member_rank = 0;
        for (int bit_idx = 0; bit_idx < 32; bit_idx++) {
            unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
            if (lane_bit) {
                if (laneMask & lane_bit)
                    member_pack |= 1 << member_rank;
                member_rank++;
            }
        }
        return (member_pack);
    }

    // Internal Use
    // Partitions this group into tiles of `tilesz` threads (power of two,
    // <= 32) and returns the tile containing the calling thread.
    _CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const {
        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);

        // Invalid, immediately fail
        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
            details::abort();
            return (coalesced_group(0));
        }
        // Requested tile covers the whole group: return it unchanged.
        if (size() <= tilesz) {
            return (*this);
        }

        if ((_data.coalesced.is_tiled == true) && pow2_tilesz) {
            // Already tiled: lanes are contiguous, so the tile mask is a
            // contiguous run aligned to the tile boundary.
            unsigned int base_offset = (thread_rank() & (~(tilesz - 1)));
            unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
            unsigned int mask = (unsigned int)(-1) >> (32 - masklength);

            mask <<= (details::laneid() & ~(tilesz - 1));
            coalesced_group coalesced_tile = coalesced_group(mask);
            coalesced_tile._data.coalesced.metaGroupSize = size() / tilesz;
            coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
            coalesced_tile._data.coalesced.is_tiled = true;
            return (coalesced_tile);
        }
        else if ((_data.coalesced.is_tiled == false) && pow2_tilesz) {
            // Not tiled: members may occupy arbitrary lanes. Walk the group
            // mask and collect the tilesz members belonging to this
            // thread's tile (skipping the `seen_lanes` members before it).
            unsigned int mask = 0;
            unsigned int member_rank = 0;
            int seen_lanes = (thread_rank() / tilesz) * tilesz;
            for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
                unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
                if (lane_bit) {
                    if (seen_lanes <= 0 && member_rank < tilesz) {
                        mask |= lane_bit;
                        member_rank++;
                    }
                    seen_lanes--;
                }
            }
            coalesced_group coalesced_tile = coalesced_group(mask);
            // Override parent with the size of this group
            coalesced_tile._data.coalesced.metaGroupSize = (size() + tilesz - 1) / tilesz;
            coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
            return coalesced_tile;
        }
        else {
            // None in _CG_VERSION 1000
            details::abort();
        }

        return (coalesced_group(0));
    }

protected:
    // Builds a group from an explicit lane mask; rank/size metadata default
    // to a standalone (non-tiled) group.
    _CG_QUALIFIER coalesced_group(unsigned int mask) {
        _data.coalesced.mask = mask;
        _data.coalesced.size = __popc(mask);
        _data.coalesced.metaGroupRank = 0;
        _data.coalesced.metaGroupSize = 1;
        _data.coalesced.is_tiled = false;
    }

    _CG_QUALIFIER unsigned int get_mask() const {
        return (_data.coalesced.mask);
    }

public:
    _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)

    _CG_QUALIFIER unsigned int num_threads() const {
        return _data.coalesced.size;
    }

    // Legacy alias for num_threads().
    _CG_QUALIFIER unsigned int size() const {
        return num_threads();
    }

    // Rank = number of group members occupying lanes below the caller's lane.
    _CG_QUALIFIER unsigned int thread_rank() const {
        return (__popc(_data.coalesced.mask & details::lanemask32_lt()));
    }

    // Rank of this group in the upper level of the hierarchy
    _CG_QUALIFIER unsigned int meta_group_rank() const {
        return _data.coalesced.metaGroupRank;
    }

    // Total num partitions created out of all CTAs when the group was created
    _CG_QUALIFIER unsigned int meta_group_size() const {
        return _data.coalesced.metaGroupSize;
    }

    _CG_QUALIFIER void sync() const {
        __syncwarp(_data.coalesced.mask);
    }

#ifdef _CG_CPP11_FEATURES
    // Shuffles below translate group ranks to physical warp lanes: rank 0
    // maps to the lowest set lane of the mask (__ffs), and __fns finds the
    // n-th set lane for the general case.
    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
    _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
        unsigned int lane = (srcRank == 0) ? __ffs(_data.coalesced.mask) - 1 :
            (size() == 32) ? srcRank : __fns(_data.coalesced.mask, 0, (srcRank + 1));

        return details::tile::shuffle_dispatch<TyElem>::shfl(
            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
    }

    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
    _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
        if (size() == 32) {
            return details::tile::shuffle_dispatch<TyElem>::shfl_down(
                _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
        }

        // Source is the delta-th group member above the caller; out-of-range
        // (__fns returns >= 32) falls back to the caller's own lane.
        unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);

        if (lane >= 32)
            lane = details::laneid();

        return details::tile::shuffle_dispatch<TyElem>::shfl(
            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
    }

    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
    _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, int delta) const {
        if (size() == 32) {
            return details::tile::shuffle_dispatch<TyElem>::shfl_up(
                _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
        }

        // Negative count searches downward through the mask for the source.
        unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
        if (lane >= 32)
            lane = details::laneid();

        return details::tile::shuffle_dispatch<TyElem>::shfl(
            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
    }
#else
    // Pre-C++11 variants: arithmetic types only, direct __shfl_sync calls.
    template <typename TyIntegral>
    _CG_QUALIFIER TyIntegral shfl(TyIntegral var, unsigned int src_rank) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 :
            (size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1));
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
    }

    template <typename TyIntegral>
    _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, int delta) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        if (size() == 32) {
            return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32));
        }
        unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
        if (lane >= 32) lane = details::laneid();
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
    }

    template <typename TyIntegral>
    _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, int delta) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        if (size() == 32) {
            return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32));
        }
        unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
        if (lane >= 32) lane = details::laneid();
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
    }
#endif

    _CG_QUALIFIER int any(int predicate) const {
        return (__ballot_sync(_data.coalesced.mask, predicate) != 0);
    }
    _CG_QUALIFIER int all(int predicate) const {
        return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask);
    }
    // Ballot result is re-indexed so bit r corresponds to group rank r.
    _CG_QUALIFIER unsigned int ballot(int predicate) const {
        if (size() == 32) {
            return (__ballot_sync(0xFFFFFFFF, predicate));
        }
        unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate);
        return (_packLanes(lane_ballot));
    }

#ifdef _CG_HAS_MATCH_COLLECTIVE

    template <typename TyIntegral>
    _CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        if (size() == 32) {
            return (__match_any_sync(0xFFFFFFFF, val));
        }
        unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val);
        return (_packLanes(lane_match));
    }

    template <typename TyIntegral>
    _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        if (size() == 32) {
            return (__match_all_sync(0xFFFFFFFF, val, &pred));
        }
        unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred);
        return (_packLanes(lane_match));
    }

#endif /* _CG_HAS_MATCH_COLLECTIVE */

};
940
+
941
+ _CG_QUALIFIER coalesced_group coalesced_threads()
942
+ {
943
+ return (coalesced_group(__activemask()));
944
+ }
945
+
946
+ namespace details {
947
+ template <unsigned int Size> struct verify_thread_block_tile_size;
948
+ template <> struct verify_thread_block_tile_size<32> { typedef void OK; };
949
+ template <> struct verify_thread_block_tile_size<16> { typedef void OK; };
950
+ template <> struct verify_thread_block_tile_size<8> { typedef void OK; };
951
+ template <> struct verify_thread_block_tile_size<4> { typedef void OK; };
952
+ template <> struct verify_thread_block_tile_size<2> { typedef void OK; };
953
+ template <> struct verify_thread_block_tile_size<1> { typedef void OK; };
954
+
955
+ #ifdef _CG_CPP11_FEATURES
956
+ template <unsigned int Size>
957
+ using _is_power_of_2 = _CG_STL_NAMESPACE::integral_constant<bool, (Size & (Size - 1)) == 0>;
958
+
959
+ template <unsigned int Size>
960
+ using _is_single_warp = _CG_STL_NAMESPACE::integral_constant<bool, Size <= 32>;
961
+ template <unsigned int Size>
962
+ using _is_multi_warp =
963
+ _CG_STL_NAMESPACE::integral_constant<bool, (Size > 32) && (Size <= 1024)>;
964
+
965
+ template <unsigned int Size>
966
+ using _is_valid_single_warp_tile =
967
+ _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_single_warp<Size>::value>;
968
+ template <unsigned int Size>
969
+ using _is_valid_multi_warp_tile =
970
+ _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_multi_warp<Size>::value>;
971
+ #else
972
+ template <unsigned int Size>
973
+ struct _is_multi_warp {
974
+ static const bool value = false;
975
+ };
976
+ #endif
977
+ }
978
+
979
+ template <unsigned int Size>
980
+ class __static_size_tile_base
981
+ {
982
+ protected:
983
+ _CG_STATIC_CONST_DECL unsigned int numThreads = Size;
984
+
985
+ public:
986
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
987
+
988
+ // Rank of thread within tile
989
+ _CG_STATIC_QUALIFIER unsigned int thread_rank() {
990
+ return (details::cta::thread_rank() & (numThreads - 1));
991
+ }
992
+
993
+ // Number of threads within tile
994
+ _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int num_threads() {
995
+ return numThreads;
996
+ }
997
+
998
+ _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int size() {
999
+ return num_threads();
1000
+ }
1001
+ };
1002
+
1003
+ template <unsigned int Size>
1004
+ class __static_size_thread_block_tile_base : public __static_size_tile_base<Size>
1005
+ {
1006
+ friend class details::_coalesced_group_data_access;
1007
+ typedef details::tile::tile_helpers<Size> th;
1008
+
1009
+ #ifdef _CG_CPP11_FEATURES
1010
+ static_assert(details::_is_valid_single_warp_tile<Size>::value, "Size must be one of 1/2/4/8/16/32");
1011
+ #else
1012
+ typedef typename details::verify_thread_block_tile_size<Size>::OK valid;
1013
+ #endif
1014
+ using __static_size_tile_base<Size>::numThreads;
1015
+ _CG_STATIC_CONST_DECL unsigned int fullMask = 0xFFFFFFFF;
1016
+
1017
+ protected:
1018
+ _CG_STATIC_QUALIFIER unsigned int build_mask() {
1019
+ unsigned int mask = fullMask;
1020
+ if (numThreads != 32) {
1021
+ // [0,31] representing the current active thread in the warp
1022
+ unsigned int laneId = details::laneid();
1023
+ // shift mask according to the partition it belongs to
1024
+ mask = th::tileMask << (laneId & ~(th::laneMask));
1025
+ }
1026
+ return (mask);
1027
+ }
1028
+
1029
+ public:
1030
+ _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
1031
+
1032
+ _CG_STATIC_QUALIFIER void sync() {
1033
+ __syncwarp(build_mask());
1034
+ }
1035
+
1036
+ #ifdef _CG_CPP11_FEATURES
1037
+ // PTX supported collectives
1038
+ template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
1039
+ _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
1040
+ return details::tile::shuffle_dispatch<TyElem>::shfl(
1041
+ _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), srcRank, numThreads);
1042
+ }
1043
+
1044
+ template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
1045
+ _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
1046
+ return details::tile::shuffle_dispatch<TyElem>::shfl_down(
1047
+ _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
1048
+ }
1049
+
1050
+ template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
1051
+ _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int delta) const {
1052
+ return details::tile::shuffle_dispatch<TyElem>::shfl_up(
1053
+ _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
1054
+ }
1055
+
1056
+ template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
1057
+ _CG_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int laneMask) const {
1058
+ return details::tile::shuffle_dispatch<TyElem>::shfl_xor(
1059
+ _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), laneMask, numThreads);
1060
+ }
1061
+ #else
1062
+ template <typename TyIntegral>
1063
+ _CG_QUALIFIER TyIntegral shfl(TyIntegral var, int srcRank) const {
1064
+ details::assert_if_not_arithmetic<TyIntegral>();
1065
+ return (__shfl_sync(build_mask(), var, srcRank, numThreads));
1066
+ }
1067
+
1068
+ template <typename TyIntegral>
1069
+ _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, unsigned int delta) const {
1070
+ details::assert_if_not_arithmetic<TyIntegral>();
1071
+ return (__shfl_down_sync(build_mask(), var, delta, numThreads));
1072
+ }
1073
+
1074
+ template <typename TyIntegral>
1075
+ _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, unsigned int delta) const {
1076
+ details::assert_if_not_arithmetic<TyIntegral>();
1077
+ return (__shfl_up_sync(build_mask(), var, delta, numThreads));
1078
+ }
1079
+
1080
+ template <typename TyIntegral>
1081
+ _CG_QUALIFIER TyIntegral shfl_xor(TyIntegral var, unsigned int laneMask) const {
1082
+ details::assert_if_not_arithmetic<TyIntegral>();
1083
+ return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
1084
+ }
1085
+ #endif //_CG_CPP11_FEATURES
1086
+
1087
+ _CG_QUALIFIER int any(int predicate) const {
1088
+ unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
1089
+ return (lane_ballot != 0);
1090
+ }
1091
+ _CG_QUALIFIER int all(int predicate) const {
1092
+ unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
1093
+ return (lane_ballot == build_mask());
1094
+ }
1095
+ _CG_QUALIFIER unsigned int ballot(int predicate) const {
1096
+ unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
1097
+ return (lane_ballot >> (details::laneid() & (~(th::laneMask))));
1098
+ }
1099
+
1100
+ #ifdef _CG_HAS_MATCH_COLLECTIVE
1101
+ template <typename TyIntegral>
1102
+ _CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
1103
+ details::assert_if_not_arithmetic<TyIntegral>();
1104
+ unsigned int lane_match = __match_any_sync(build_mask(), val);
1105
+ return (lane_match >> (details::laneid() & (~(th::laneMask))));
1106
+ }
1107
+
1108
+ template <typename TyIntegral>
1109
+ _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
1110
+ details::assert_if_not_arithmetic<TyIntegral>();
1111
+ unsigned int lane_match = __match_all_sync(build_mask(), val, &pred);
1112
+ return (lane_match >> (details::laneid() & (~(th::laneMask))));
1113
+ }
1114
+ #endif
1115
+
1116
+ };
1117
+
1118
+ template <unsigned int Size, typename ParentT>
1119
+ class __static_parent_thread_block_tile_base
1120
+ {
1121
+ public:
1122
+ // Rank of this group in the upper level of the hierarchy
1123
+ _CG_STATIC_QUALIFIER unsigned int meta_group_rank() {
1124
+ return ParentT::thread_rank() / Size;
1125
+ }
1126
+
1127
+ // Total num partitions created out of all CTAs when the group was created
1128
+ _CG_STATIC_QUALIFIER unsigned int meta_group_size() {
1129
+ return (ParentT::size() + Size - 1) / Size;
1130
+ }
1131
+ };
1132
+
1133
+ /**
1134
+ * class thread_block_tile<unsigned int Size, ParentT = void>
1135
+ *
1136
+ * Statically-sized group type, representing one tile of a thread block.
1137
+ * The only specializations currently supported are those with native
1138
+ * hardware support (1/2/4/8/16/32)
1139
+ *
1140
+ * This group exposes warp-synchronous builtins.
1141
+ * Can only be constructed via tiled_partition<Size>(ParentT&)
1142
+ */
1143
+
1144
+ template <unsigned int Size, typename ParentT = void>
1145
+ class __single_warp_thread_block_tile :
1146
+ public __static_size_thread_block_tile_base<Size>,
1147
+ public __static_parent_thread_block_tile_base<Size, ParentT>
1148
+ {
1149
+ typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
1150
+ friend class details::_coalesced_group_data_access;
1151
+
1152
+ protected:
1153
+ _CG_QUALIFIER __single_warp_thread_block_tile() { };
1154
+ _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int, unsigned int) { };
1155
+
1156
+ _CG_STATIC_QUALIFIER unsigned int get_mask() {
1157
+ return __static_size_thread_block_tile_base<Size>::build_mask();
1158
+ }
1159
+ };
1160
+
1161
+ template <unsigned int Size>
1162
+ class __single_warp_thread_block_tile<Size, void> :
1163
+ public __static_size_thread_block_tile_base<Size>,
1164
+ public thread_group_base<details::coalesced_group_id>
1165
+ {
1166
+ _CG_STATIC_CONST_DECL unsigned int numThreads = Size;
1167
+
1168
+ template <unsigned int, typename ParentT> friend class __single_warp_thread_block_tile;
1169
+ friend class details::_coalesced_group_data_access;
1170
+
1171
+ typedef __static_size_thread_block_tile_base<numThreads> staticSizeBaseT;
1172
+
1173
+ protected:
1174
+ _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int meta_group_rank = 0, unsigned int meta_group_size = 1) {
1175
+ _data.coalesced.mask = staticSizeBaseT::build_mask();
1176
+ _data.coalesced.size = numThreads;
1177
+ _data.coalesced.metaGroupRank = meta_group_rank;
1178
+ _data.coalesced.metaGroupSize = meta_group_size;
1179
+ _data.coalesced.is_tiled = true;
1180
+ }
1181
+
1182
+ _CG_QUALIFIER unsigned int get_mask() const {
1183
+ return (_data.coalesced.mask);
1184
+ }
1185
+
1186
+ public:
1187
+ using staticSizeBaseT::sync;
1188
+ using staticSizeBaseT::size;
1189
+ using staticSizeBaseT::num_threads;
1190
+ using staticSizeBaseT::thread_rank;
1191
+
1192
+ _CG_QUALIFIER unsigned int meta_group_rank() const {
1193
+ return _data.coalesced.metaGroupRank;
1194
+ }
1195
+
1196
+ _CG_QUALIFIER unsigned int meta_group_size() const {
1197
+ return _data.coalesced.metaGroupSize;
1198
+ }
1199
+ };
1200
+
1201
+ /**
1202
+ * Outer level API calls
1203
+ * void sync(GroupT) - see <group_type>.sync()
1204
+ * void thread_rank(GroupT) - see <group_type>.thread_rank()
1205
+ * void group_size(GroupT) - see <group_type>.size()
1206
+ */
1207
+ template <class GroupT>
1208
+ _CG_QUALIFIER void sync(GroupT const &g)
1209
+ {
1210
+ g.sync();
1211
+ }
1212
+
1213
+ // TODO: Use a static dispatch to determine appropriate return type
1214
+ // C++03 is stuck with unsigned long long for now
1215
+ #ifdef _CG_CPP11_FEATURES
1216
+ template <class GroupT>
1217
+ _CG_QUALIFIER auto thread_rank(GroupT const& g) -> decltype(g.thread_rank()) {
1218
+ return g.thread_rank();
1219
+ }
1220
+
1221
+
1222
+ template <class GroupT>
1223
+ _CG_QUALIFIER auto group_size(GroupT const &g) -> decltype(g.num_threads()) {
1224
+ return g.num_threads();
1225
+ }
1226
+ #else
1227
+ template <class GroupT>
1228
+ _CG_QUALIFIER unsigned long long thread_rank(GroupT const& g) {
1229
+ return static_cast<unsigned long long>(g.thread_rank());
1230
+ }
1231
+
1232
+
1233
+ template <class GroupT>
1234
+ _CG_QUALIFIER unsigned long long group_size(GroupT const &g) {
1235
+ return static_cast<unsigned long long>(g.num_threads());
1236
+ }
1237
+ #endif
1238
+
1239
+
1240
+ /**
1241
+ * tiled_partition
1242
+ *
1243
+ * The tiled_partition(parent, tilesz) method is a collective operation that
1244
+ * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
1245
+ *
1246
+ * A total of ((size(parent)+tilesz-1)/tilesz) subgroups will
1247
+ * be created where threads having identical k = (thread_rank(parent)/tilesz)
1248
+ * will be members of the same subgroup.
1249
+ *
1250
+ * The implementation may cause the calling thread to wait until all the members
1251
+ * of the parent group have invoked the operation before resuming execution.
1252
+ *
1253
+ * Functionality is limited to power-of-two sized subgorup instances of at most
1254
+ * 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be
1255
+ * tiled_partition() in _CG_VERSION 1000.
1256
+ */
1257
+ _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz)
1258
+ {
1259
+ if (parent.get_type() == details::coalesced_group_id) {
1260
+ const coalesced_group *_cg = static_cast<const coalesced_group*>(&parent);
1261
+ return _cg->_get_tiled_threads(tilesz);
1262
+ }
1263
+ else {
1264
+ const thread_block *_tb = static_cast<const thread_block*>(&parent);
1265
+ return _tb->_get_tiled_threads(tilesz);
1266
+ }
1267
+ }
1268
+
1269
+ // Thread block type overload: returns a basic thread_group for now (may be specialized later)
1270
+ _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz)
1271
+ {
1272
+ return (parent._get_tiled_threads(tilesz));
1273
+ }
1274
+
1275
+ // Coalesced group type overload: retains its ability to stay coalesced
1276
+ _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz)
1277
+ {
1278
+ return (parent._get_tiled_threads(tilesz));
1279
+ }
1280
+
1281
+ namespace details {
1282
+ template <unsigned int Size, typename ParentT>
1283
+ class internal_thread_block_tile : public __single_warp_thread_block_tile<Size, ParentT> {};
1284
+
1285
+ template <unsigned int Size, typename ParentT>
1286
+ _CG_QUALIFIER internal_thread_block_tile<Size, ParentT> tiled_partition_internal() {
1287
+ return internal_thread_block_tile<Size, ParentT>();
1288
+ }
1289
+
1290
+ template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
1291
+ _CG_QUALIFIER TyVal multi_warp_collectives_helper(
1292
+ const GroupT& group,
1293
+ WarpLambda warp_lambda,
1294
+ InterWarpLambda inter_warp_lambda) {
1295
+ return group.template collectives_scheme<TyVal>(warp_lambda, inter_warp_lambda);
1296
+ }
1297
+
1298
+ template <typename T, typename GroupT>
1299
+ _CG_QUALIFIER T* multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id) {
1300
+ return group.template get_scratch_location<T>(warp_id);
1301
+ }
1302
+
1303
+ template <typename GroupT>
1304
+ _CG_QUALIFIER details::barrier_t* multi_warp_sync_location_getter(const GroupT& group) {
1305
+ return group.get_sync_location();
1306
+ }
1307
+
1308
+ }
1309
+ /**
1310
+ * tiled_partition<tilesz>
1311
+ *
1312
+ * The tiled_partition<tilesz>(parent) method is a collective operation that
1313
+ * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
1314
+ *
1315
+ * A total of ((size(parent)/tilesz) subgroups will be created,
1316
+ * therefore the parent group size must be evenly divisible by the tilesz.
1317
+ * The allow parent groups are thread_block or thread_block_tile<size>.
1318
+ *
1319
+ * The implementation may cause the calling thread to wait until all the members
1320
+ * of the parent group have invoked the operation before resuming execution.
1321
+ *
1322
+ * Functionality is limited to native hardware sizes, 1/2/4/8/16/32.
1323
+ * The size(parent) must be greater than the template Size parameter
1324
+ * otherwise the results are undefined.
1325
+ */
1326
+
1327
+ #if defined(_CG_CPP11_FEATURES)
1328
+ template <unsigned int Size>
1329
+ class __static_size_multi_warp_tile_base : public __static_size_tile_base<Size>
1330
+ {
1331
+ static_assert(details::_is_valid_multi_warp_tile<Size>::value, "Size must be one of 64/128/256/512");
1332
+
1333
+ template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
1334
+ friend __device__ TyVal details::multi_warp_collectives_helper(
1335
+ const GroupT& group,
1336
+ WarpLambda warp_lambda,
1337
+ InterWarpLambda inter_warp_lambda);
1338
+ template <typename T, typename GroupT>
1339
+ friend __device__ T* details::multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id);
1340
+ template <typename GroupT>
1341
+ friend __device__ details::barrier_t* details::multi_warp_sync_location_getter(const GroupT& group);
1342
+ template <unsigned int OtherSize>
1343
+ friend class __static_size_multi_warp_tile_base;
1344
+ using WarpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
1345
+ using ThisType = __static_size_multi_warp_tile_base<Size>;
1346
+ _CG_STATIC_CONST_DECL int numWarps = Size / 32;
1347
+
1348
+ protected:
1349
+ details::multi_warp_scratch* const tile_memory;
1350
+
1351
+ template <typename GroupT>
1352
+ _CG_QUALIFIER __static_size_multi_warp_tile_base(const GroupT& g) : tile_memory(g.tile_memory) {
1353
+ #if defined(_CG_HAS_RESERVED_SHARED)
1354
+ details::sync_warps_reset(get_sync_location(), details::cta::thread_rank());
1355
+ g.sync();
1356
+ #endif
1357
+ }
1358
+
1359
+
1360
+ private:
1361
+ _CG_QUALIFIER details::barrier_t* get_sync_location() const {
1362
+ // Different group sizes use different barriers, all groups of a given size share one barrier.
1363
+ unsigned int sync_id = details::log2(Size / 64);
1364
+ return &tile_memory->barriers[sync_id];
1365
+ }
1366
+
1367
+ template <typename T>
1368
+ _CG_QUALIFIER T* get_scratch_location(unsigned int warp_id) const {
1369
+ unsigned int scratch_id = (details::cta::thread_rank() - thread_rank()) / 32 + warp_id;
1370
+ return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
1371
+ }
1372
+
1373
+ template <typename T>
1374
+ _CG_QUALIFIER T* get_scratch_location() const {
1375
+ unsigned int scratch_id = details::cta::thread_rank() / 32;
1376
+ return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
1377
+ }
1378
+
1379
+ template <typename TyVal>
1380
+ _CG_QUALIFIER TyVal shfl_impl(TyVal val, unsigned int src) const {
1381
+ unsigned int src_warp = src / 32;
1382
+ auto warp = details::tiled_partition_internal<32, ThisType>();
1383
+ details::barrier_t* sync_location = get_sync_location();
1384
+
1385
+ // Get warp slot of the source threads warp.
1386
+ TyVal* warp_scratch_location = get_scratch_location<TyVal>(src_warp);
1387
+
1388
+ if (warp.meta_group_rank() == src_warp) {
1389
+ warp.sync();
1390
+ // Put shuffled value into my warp slot and let my warp arrive at the barrier.
1391
+ if (thread_rank() == src) {
1392
+ *warp_scratch_location = val;
1393
+ }
1394
+ details::sync_warps_arrive(sync_location, details::cta::thread_rank(), numWarps);
1395
+ TyVal result = *warp_scratch_location;
1396
+ details::sync_warps_wait(sync_location, details::cta::thread_rank());
1397
+ return result;
1398
+ }
1399
+ else {
1400
+ // Wait for the source warp to arrive on the barrier.
1401
+ details::sync_warps_wait_for_specific_warp(sync_location,
1402
+ (details::cta::thread_rank() / 32 - warp.meta_group_rank() + src_warp));
1403
+ TyVal result = *warp_scratch_location;
1404
+ details::sync_warps(sync_location, details::cta::thread_rank(), numWarps);
1405
+ return result;
1406
+ }
1407
+ }
1408
+
1409
+ template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
1410
+ _CG_QUALIFIER TyVal collectives_scheme(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const {
1411
+ static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
1412
+ "Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
1413
+ auto warp = details::tiled_partition_internal<32, ThisType>();
1414
+ details::barrier_t* sync_location = get_sync_location();
1415
+ TyVal* warp_scratch_location = get_scratch_location<TyVal>();
1416
+
1417
+ warp_lambda(warp, warp_scratch_location);
1418
+
1419
+ if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), numWarps)) {
1420
+ auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();
1421
+ if (subwarp.meta_group_rank() == 0) {
1422
+ TyVal* thread_scratch_location = get_scratch_location<TyVal>(subwarp.thread_rank());
1423
+ inter_warp_lambda(subwarp, thread_scratch_location);
1424
+ }
1425
+ warp.sync();
1426
+ details::sync_warps_release(sync_location, warp.thread_rank() == 0, details::cta::thread_rank(), numWarps);
1427
+ }
1428
+ TyVal result = *warp_scratch_location;
1429
+ return result;
1430
+ }
1431
+
1432
+ public:
1433
+ _CG_STATIC_CONST_DECL unsigned int _group_id = details::multi_tile_group_id;
1434
+
1435
+ using __static_size_tile_base<Size>::thread_rank;
1436
+
1437
+ template <typename TyVal>
1438
+ _CG_QUALIFIER TyVal shfl(TyVal val, unsigned int src) const {
1439
+ static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
1440
+ "Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
1441
+ return shfl_impl(val, src);
1442
+ }
1443
+
1444
+ _CG_QUALIFIER void sync() const {
1445
+ details::sync_warps(get_sync_location(), details::cta::thread_rank(), numWarps);
1446
+ }
1447
+
1448
+ _CG_QUALIFIER int any(int predicate) const {
1449
+ auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
1450
+ *warp_scratch_location = __any_sync(0xFFFFFFFF, predicate);
1451
+ };
1452
+ auto inter_warp_lambda =
1453
+ [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
1454
+ *thread_scratch_location = __any_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
1455
+ };
1456
+ return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
1457
+ }
1458
+
1459
+ _CG_QUALIFIER int all(int predicate) const {
1460
+ auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
1461
+ *warp_scratch_location = __all_sync(0xFFFFFFFF, predicate);
1462
+ };
1463
+ auto inter_warp_lambda =
1464
+ [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
1465
+ *thread_scratch_location = __all_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
1466
+ };
1467
+ return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
1468
+ }
1469
+ };
1470
+
1471
+
1472
+ template <unsigned int Size, typename ParentT = void>
1473
+ class __multi_warp_thread_block_tile :
1474
+ public __static_size_multi_warp_tile_base<Size>,
1475
+ public __static_parent_thread_block_tile_base<Size, ParentT>
1476
+ {
1477
+ typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
1478
+ typedef __static_size_multi_warp_tile_base<Size> staticTileBaseT;
1479
+ protected:
1480
+ _CG_QUALIFIER __multi_warp_thread_block_tile(const ParentT& g) :
1481
+ __static_size_multi_warp_tile_base<Size>(g) {}
1482
+ };
1483
+
1484
+ template <unsigned int Size>
1485
+ class __multi_warp_thread_block_tile<Size, void> : public __static_size_multi_warp_tile_base<Size>
1486
+ {
1487
+ const unsigned int metaGroupRank;
1488
+ const unsigned int metaGroupSize;
1489
+
1490
+ protected:
1491
+ template <unsigned int OtherSize, typename ParentT>
1492
+ _CG_QUALIFIER __multi_warp_thread_block_tile(const __multi_warp_thread_block_tile<OtherSize, ParentT>& g) :
1493
+ __static_size_multi_warp_tile_base<Size>(g), metaGroupRank(g.meta_group_rank()), metaGroupSize(g.meta_group_size()) {}
1494
+
1495
+ public:
1496
+ _CG_QUALIFIER unsigned int meta_group_rank() const {
1497
+ return metaGroupRank;
1498
+ }
1499
+
1500
+ _CG_QUALIFIER unsigned int meta_group_size() const {
1501
+ return metaGroupSize;
1502
+ }
1503
+ };
1504
+ #endif
1505
+
1506
+ template <unsigned int Size, typename ParentT = void>
1507
+ class thread_block_tile;
1508
+
1509
+ namespace details {
1510
+ template <unsigned int Size, typename ParentT, bool IsMultiWarp>
1511
+ class thread_block_tile_impl;
1512
+
1513
+ template <unsigned int Size, typename ParentT>
1514
+ class thread_block_tile_impl<Size, ParentT, false>: public __single_warp_thread_block_tile<Size, ParentT>
1515
+ {
1516
+ protected:
1517
+ template <unsigned int OtherSize, typename OtherParentT, bool OtherIsMultiWarp>
1518
+ _CG_QUALIFIER thread_block_tile_impl(const thread_block_tile_impl<OtherSize, OtherParentT, OtherIsMultiWarp>& g) :
1519
+ __single_warp_thread_block_tile<Size, ParentT>(g.meta_group_rank(), g.meta_group_size()) {}
1520
+
1521
+ _CG_QUALIFIER thread_block_tile_impl(const thread_block& g) :
1522
+ __single_warp_thread_block_tile<Size, ParentT>() {}
1523
+ };
1524
+
1525
+ #if defined(_CG_CPP11_FEATURES)
1526
+ template <unsigned int Size, typename ParentT>
1527
+ class thread_block_tile_impl<Size, ParentT, true> : public __multi_warp_thread_block_tile<Size, ParentT>
1528
+ {
1529
+ protected:
1530
+ template <typename GroupT>
1531
+ _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) :
1532
+ __multi_warp_thread_block_tile<Size, ParentT>(g) {}
1533
+ };
1534
+ #else
1535
+ template <unsigned int Size, typename ParentT>
1536
+ class thread_block_tile_impl<Size, ParentT, true>
1537
+ {
1538
+ protected:
1539
+ template <typename GroupT>
1540
+ _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) {}
1541
+ };
1542
+ #endif
1543
+ }
1544
+
1545
+ template <unsigned int Size, typename ParentT>
1546
+ class thread_block_tile : public details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>
1547
+ {
1548
+ friend _CG_QUALIFIER thread_block_tile<1, void> this_thread();
1549
+
1550
+ protected:
1551
+ _CG_QUALIFIER thread_block_tile(const ParentT& g) :
1552
+ details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>(g) {}
1553
+
1554
+ public:
1555
+ _CG_QUALIFIER operator thread_block_tile<Size, void>() const {
1556
+ return thread_block_tile<Size, void>(*this);
1557
+ }
1558
+ };
1559
+
1560
+ template <unsigned int Size>
1561
+ class thread_block_tile<Size, void> : public details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>
1562
+ {
1563
+ template <unsigned int, typename ParentT>
1564
+ friend class thread_block_tile;
1565
+
1566
+ protected:
1567
+ template <unsigned int OtherSize, typename OtherParentT>
1568
+ _CG_QUALIFIER thread_block_tile(const thread_block_tile<OtherSize, OtherParentT>& g) :
1569
+ details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
1570
+
1571
+ public:
1572
+ template <typename ParentT>
1573
+ _CG_QUALIFIER thread_block_tile(const thread_block_tile<Size, ParentT>& g) :
1574
+ details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
1575
+ };
1576
+
1577
+ namespace details {
1578
+ template <unsigned int Size, typename ParentT>
1579
+ struct tiled_partition_impl;
1580
+
1581
+ template <unsigned int Size>
1582
+ struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size, thread_block> {
1583
+ _CG_QUALIFIER tiled_partition_impl(const thread_block& g) :
1584
+ thread_block_tile<Size, thread_block>(g) {}
1585
+ };
1586
+
1587
+ // ParentT = static thread_block_tile<ParentSize, GrandParent> specialization
1588
+ template <unsigned int Size, unsigned int ParentSize, typename GrandParent>
1589
+ struct tiled_partition_impl<Size, thread_block_tile<ParentSize, GrandParent> > :
1590
+ public thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> > {
1591
+ #ifdef _CG_CPP11_FEATURES
1592
+ static_assert(Size < ParentSize, "Tile size bigger or equal to the parent group size");
1593
+ #endif
1594
+ _CG_QUALIFIER tiled_partition_impl(const thread_block_tile<ParentSize, GrandParent>& g) :
1595
+ thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> >(g) {}
1596
+ };
1597
+
1598
+ }
1599
+
1600
+ template <unsigned int Size, typename ParentT>
1601
+ _CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
1602
+ {
1603
+ return details::tiled_partition_impl<Size, ParentT>(g);
1604
+ }
1605
+
1606
+ /**
1607
+ * thread_group this_thread()
1608
+ *
1609
+ * Constructs a generic thread_group containing only the calling thread
1610
+ */
1611
+ _CG_QUALIFIER thread_block_tile<1, void> this_thread()
1612
+ {
1613
+ // Make thread_block_tile<1, thread_block> parent of the returned group, so it will have its
1614
+ // meta group rank and size set to 0 and 1 respectively.
1615
+ return thread_block_tile<1, thread_block_tile<1, thread_block> >(this_thread_block());
1616
+ }
1617
+
1618
+ /**
1619
+ * <group_type>.sync()
1620
+ *
1621
+ * Executes a barrier across the group
1622
+ *
1623
+ * Implements both a compiler fence and an architectural fence to prevent,
1624
+ * memory reordering around the barrier.
1625
+ */
1626
+ _CG_QUALIFIER void thread_group::sync() const
1627
+ {
1628
+ switch (_data.group.type) {
1629
+ case details::coalesced_group_id:
1630
+ cooperative_groups::sync(*static_cast<const coalesced_group*>(this));
1631
+ break;
1632
+ case details::thread_block_id:
1633
+ cooperative_groups::sync(*static_cast<const thread_block*>(this));
1634
+ break;
1635
+ case details::grid_group_id:
1636
+ cooperative_groups::sync(*static_cast<const grid_group*>(this));
1637
+ break;
1638
+ #if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
1639
+ case details::multi_grid_group_id:
1640
+ cooperative_groups::sync(*static_cast<const multi_grid_group*>(this));
1641
+ break;
1642
+ #endif
1643
+ #if defined(_CG_HAS_CLUSTER_GROUP)
1644
+ case details::cluster_group_id:
1645
+ cooperative_groups::sync(*static_cast<const cluster_group*>(this));
1646
+ break;
1647
+ #endif
1648
+ default:
1649
+ break;
1650
+ }
1651
+ }
1652
+
1653
+ /**
1654
+ * <group_type>.size()
1655
+ *
1656
+ * Returns the total number of threads in the group.
1657
+ */
1658
+ _CG_QUALIFIER unsigned long long thread_group::size() const
1659
+ {
1660
+ unsigned long long size = 0;
1661
+ switch (_data.group.type) {
1662
+ case details::coalesced_group_id:
1663
+ size = cooperative_groups::group_size(*static_cast<const coalesced_group*>(this));
1664
+ break;
1665
+ case details::thread_block_id:
1666
+ size = cooperative_groups::group_size(*static_cast<const thread_block*>(this));
1667
+ break;
1668
+ case details::grid_group_id:
1669
+ size = cooperative_groups::group_size(*static_cast<const grid_group*>(this));
1670
+ break;
1671
+ #if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
1672
+ case details::multi_grid_group_id:
1673
+ size = cooperative_groups::group_size(*static_cast<const multi_grid_group*>(this));
1674
+ break;
1675
+ #endif
1676
+ #if defined(_CG_HAS_CLUSTER_GROUP)
1677
+ case details::cluster_group_id:
1678
+ size = cooperative_groups::group_size(*static_cast<const cluster_group*>(this));
1679
+ break;
1680
+ #endif
1681
+ default:
1682
+ break;
1683
+ }
1684
+ return size;
1685
+ }
1686
+
1687
+ /**
1688
+ * <group_type>.thread_rank()
1689
+ *
1690
+ * Returns the linearized rank of the calling thread along the interval [0, size()).
1691
+ */
1692
+ _CG_QUALIFIER unsigned long long thread_group::thread_rank() const
1693
+ {
1694
+ unsigned long long rank = 0;
1695
+ switch (_data.group.type) {
1696
+ case details::coalesced_group_id:
1697
+ rank = cooperative_groups::thread_rank(*static_cast<const coalesced_group*>(this));
1698
+ break;
1699
+ case details::thread_block_id:
1700
+ rank = cooperative_groups::thread_rank(*static_cast<const thread_block*>(this));
1701
+ break;
1702
+ case details::grid_group_id:
1703
+ rank = cooperative_groups::thread_rank(*static_cast<const grid_group*>(this));
1704
+ break;
1705
+ #if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
1706
+ case details::multi_grid_group_id:
1707
+ rank = cooperative_groups::thread_rank(*static_cast<const multi_grid_group*>(this));
1708
+ break;
1709
+ #endif
1710
+ #if defined(_CG_HAS_CLUSTER_GROUP)
1711
+ case details::cluster_group_id:
1712
+ rank = cooperative_groups::thread_rank(*static_cast<const cluster_group*>(this));
1713
+ break;
1714
+ #endif
1715
+ default:
1716
+ break;
1717
+ }
1718
+ return rank;
1719
+ }
1720
+
1721
+ _CG_END_NAMESPACE
1722
+
1723
+ #include <cooperative_groups/details/partitioning.h>
1724
+ #if (!defined(_MSC_VER) || defined(_WIN64))
1725
+ # include <cooperative_groups/details/invoke.h>
1726
+ #endif
1727
+
1728
+ # endif /* ! (__cplusplus, __CUDACC__) */
1729
+
1730
+ #endif /* !_COOPERATIVE_GROUPS_H_ */
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/async.h ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_ASYNC_H
50
+ #define _CG_ASYNC_H
51
+
52
+ #include "helpers.h"
53
+ #include "info.h"
54
+
55
+ #include <cuda_pipeline.h>
56
+
57
+ _CG_BEGIN_NAMESPACE
58
+
59
+ namespace details {
60
+ // Groups supported by memcpy_async
61
+ template <class TyGroup>
62
+ struct _async_copy_group_supported : public _CG_STL_NAMESPACE::false_type {};
63
+
64
+ template <unsigned int Sz, typename TyPar>
65
+ struct _async_copy_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>>
66
+ : public _CG_STL_NAMESPACE::true_type {};
67
+ template <>
68
+ struct _async_copy_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
69
+ template <>
70
+ struct _async_copy_group_supported<cooperative_groups::thread_block> : public _CG_STL_NAMESPACE::true_type {};
71
+
72
+ template <class TyGroup>
73
+ using async_copy_group_supported = _async_copy_group_supported<details::remove_qual<TyGroup>>;
74
+
75
+ // Groups that require optimization
76
+ template <class TyGroup>
77
+ struct _async_copy_optimize_tile : public _CG_STL_NAMESPACE::false_type {};
78
+
79
+ template <typename TyPar>
80
+ struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<1, TyPar>>
81
+ : public _CG_STL_NAMESPACE::false_type {};
82
+
83
+ template <unsigned int Sz, typename TyPar>
84
+ struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<Sz, TyPar>>
85
+ : public _CG_STL_NAMESPACE::true_type {};
86
+
87
+ template <class TyGroup>
88
+ using async_copy_optimize_tile = _async_copy_optimize_tile<details::remove_qual<TyGroup>>;
89
+
90
+ // SFINAE helpers for tile optimizations
91
+ template <class TyGroup>
92
+ using enable_tile_optimization =
93
+ typename _CG_STL_NAMESPACE::enable_if<async_copy_optimize_tile<TyGroup>::value, void *>::type;
94
+
95
+ template <class TyGroup>
96
+ using disable_tile_optimization =
97
+ typename _CG_STL_NAMESPACE::enable_if<!async_copy_optimize_tile<TyGroup>::value, void *>::type;
98
+
99
+ // Segment for punning to aligned types
100
+ template <unsigned int N>
101
+ struct _Segment {
102
+ int _seg[N];
103
+ };
104
+
105
+ // Trivial layout guaranteed-aligned copy-async compatible segments
106
+ template <unsigned int N>
107
+ struct Segment;
108
+ template <>
109
+ struct __align__(4) Segment<1> : public _Segment<1>{};
110
+ template <>
111
+ struct __align__(8) Segment<2> : public _Segment<2>{};
112
+ template <>
113
+ struct __align__(16) Segment<4> : public _Segment<4>{};
114
+
115
+ // Interleaved element by element copies from source to dest
116
+ template <typename TyGroup, typename TyElem>
117
+ _CG_STATIC_QUALIFIER void inline_copy(TyGroup &group, TyElem *__restrict__ dst, const TyElem *__restrict__ src,
118
+ size_t count) {
119
+ const unsigned int rank = group.thread_rank();
120
+ const unsigned int stride = group.size();
121
+
122
+ for (size_t idx = rank; idx < count; idx += stride) {
123
+ dst[idx] = src[idx];
124
+ }
125
+ }
126
+
127
+ template <typename TyGroup, typename TyElem, enable_tile_optimization<TyGroup> = nullptr>
128
+ _CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
129
+ const TyElem *__restrict__ src, size_t count) {
130
+ static_assert(async_copy_group_supported<TyGroup>::value,
131
+ "Async copy is only supported for groups that represent private shared memory");
132
+
133
+ if (count == 0) {
134
+ return;
135
+ }
136
+
137
+ const bool dstIsNotShared = !__isShared(dst);
138
+ const bool srcIsNotGlobal = !__isGlobal(src);
139
+
140
+ if (dstIsNotShared || srcIsNotGlobal) {
141
+ inline_copy(group, dst, src, count);
142
+ return;
143
+ }
144
+
145
+ const unsigned int stride = group.size();
146
+ const unsigned int rank = group.thread_rank();
147
+ // Efficient copies require warps to operate on the same amount of work at each step.
148
+ // remainders are handled in a separate stage to prevent branching
149
+ const unsigned int subWarpMask = (stride - 1);
150
+ const unsigned int subwarpCopies = (subWarpMask & (unsigned int)count);
151
+ const unsigned int maxSubwarpRank = min(rank, subwarpCopies - 1);
152
+
153
+ const size_t warpCopies = (count & (~subWarpMask));
154
+
155
+ for (size_t idx = 0; idx < warpCopies; idx += stride) {
156
+ size_t _srcIdx = rank + idx;
157
+ size_t _dstIdx = rank + idx;
158
+ __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
159
+ }
160
+
161
+ if (subwarpCopies) {
162
+ size_t _srcIdx = warpCopies + maxSubwarpRank;
163
+ size_t _dstIdx = warpCopies + maxSubwarpRank;
164
+ __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
165
+ }
166
+ }
167
+
168
+ template <typename TyGroup, typename TyElem, disable_tile_optimization<TyGroup> = nullptr>
169
+ _CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
170
+ const TyElem *__restrict__ src, size_t count) {
171
+ static_assert(async_copy_group_supported<TyGroup>::value,
172
+ "Async copy is only supported for groups that represent private shared memory");
173
+
174
+ const bool dstIsNotShared = !__isShared(dst);
175
+ const bool srcIsNotGlobal = !__isGlobal(src);
176
+
177
+ if (dstIsNotShared || srcIsNotGlobal) {
178
+ inline_copy(group, dst, src, count);
179
+ return;
180
+ }
181
+
182
+ unsigned int stride = group.size();
183
+ unsigned int rank = group.thread_rank();
184
+
185
+ for (size_t idx = rank; idx < count; idx += stride) {
186
+ size_t _srcIdx = idx;
187
+ size_t _dstIdx = idx;
188
+ __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
189
+ }
190
+ }
191
+
192
+ // Determine best possible alignment given an input and initial conditions
193
+ // Attempts to generate as little code as possible, most likely should only be used with 1 and 2 byte alignments
194
+ template <unsigned int MinAlignment, unsigned int MaxAlignment>
195
+ _CG_STATIC_QUALIFIER uint32_t find_best_alignment(void *__restrict__ dst, const void *__restrict__ src) {
196
+ // Narrowing conversion intentional
197
+ uint32_t base1 = (uint32_t) reinterpret_cast<uintptr_t>(src);
198
+ uint32_t base2 = (uint32_t) reinterpret_cast<uintptr_t>(dst);
199
+
200
+ uint32_t diff = ((base1) ^ (base2)) & (MaxAlignment - 1);
201
+
202
+ // range [MaxAlignment, alignof(elem)], step: x >> 1
203
+ // over range of possible alignments, choose best available out of range
204
+ uint32_t out = MaxAlignment;
205
+ #pragma unroll
206
+ for (uint32_t alignment = (MaxAlignment >> 1); alignment >= MinAlignment; alignment >>= 1) {
207
+ if (alignment & diff)
208
+ out = alignment;
209
+ }
210
+
211
+ return out;
212
+ }
213
+
214
+ // Determine best possible alignment given an input and initial conditions
215
+ // Attempts to generate as little code as possible, most likely should only be used with 1 and 2 byte alignments
216
+ template <typename TyType, typename TyGroup>
217
+ _CG_STATIC_QUALIFIER void copy_like(const TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
218
+ size_t count) {
219
+ const char *src = reinterpret_cast<const char *>(_src);
220
+ char *dst = reinterpret_cast<char *>(_dst);
221
+
222
+ constexpr uint32_t targetAlignment = (uint32_t)alignof(TyType);
223
+
224
+ uint32_t base = (uint32_t) reinterpret_cast<uintptr_t>(src);
225
+ uint32_t alignOffset = ((~base) + 1) & (targetAlignment - 1);
226
+
227
+ inline_copy(group, dst, src, alignOffset);
228
+ count -= alignOffset;
229
+ src += alignOffset;
230
+ dst += alignOffset;
231
+
232
+ // Copy using the best available alignment, async_copy expects n-datums, not bytes
233
+ size_t asyncCount = count / sizeof(TyType);
234
+ accelerated_async_copy(group, reinterpret_cast<TyType *>(dst), reinterpret_cast<const TyType *>(src), asyncCount);
235
+ asyncCount *= sizeof(TyType);
236
+
237
+ count -= asyncCount;
238
+ src += asyncCount;
239
+ dst += asyncCount;
240
+ inline_copy(group, dst, src, count);
241
+ }
242
+
243
+ // We must determine alignment and manually align src/dst ourselves
244
+ template <size_t AlignHint>
245
+ struct _memcpy_async_align_dispatch {
246
+ template <typename TyGroup>
247
+ _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ dst, const void *__restrict__ src, size_t count) {
248
+ uint32_t alignment = find_best_alignment<AlignHint, 16>(dst, src);
249
+
250
+ // Avoid copying the extra bytes if desired copy count is smaller
251
+ alignment = count < alignment ? AlignHint : alignment;
252
+
253
+ switch (alignment) {
254
+ default:
255
+ case 1:
256
+ inline_copy(group, reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src), count);
257
+ break;
258
+ case 2:
259
+ inline_copy(group, reinterpret_cast<short *>(dst), reinterpret_cast<const short *>(src), count >> 1);
260
+ break;
261
+ case 4:
262
+ copy_like<Segment<1>>(group, dst, src, count);
263
+ break;
264
+ case 8:
265
+ copy_like<Segment<2>>(group, dst, src, count);
266
+ break;
267
+ case 16:
268
+ copy_like<Segment<4>>(group, dst, src, count);
269
+ break;
270
+ }
271
+ }
272
+ };
273
+
274
+ // Specialization for 4 byte alignments
275
+ template <>
276
+ struct _memcpy_async_align_dispatch<4> {
277
+ template <typename TyGroup>
278
+ _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
279
+ size_t count) {
280
+ const Segment<1> *src = reinterpret_cast<const Segment<1> *>(_src);
281
+ Segment<1> *dst = reinterpret_cast<Segment<1> *>(_dst);
282
+
283
+ // Dispatch straight to aligned LDGSTS calls
284
+ accelerated_async_copy(group, dst, src, count / sizeof(*dst));
285
+ }
286
+ };
287
+
288
+ // Specialization for 8 byte alignments
289
+ template <>
290
+ struct _memcpy_async_align_dispatch<8> {
291
+ template <typename TyGroup>
292
+ _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
293
+ size_t count) {
294
+ const Segment<2> *src = reinterpret_cast<const Segment<2> *>(_src);
295
+ Segment<2> *dst = reinterpret_cast<Segment<2> *>(_dst);
296
+
297
+ // Dispatch straight to aligned LDGSTS calls
298
+ accelerated_async_copy(group, dst, src, count / sizeof(*dst));
299
+ }
300
+ };
301
+
302
+ // Alignments over 16 are truncated to 16 and bypass alignment
303
+ // This is the highest performing memcpy available
304
+ template <>
305
+ struct _memcpy_async_align_dispatch<16> {
306
+ template <typename TyGroup>
307
+ _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
308
+ size_t count) {
309
+ const Segment<4> *src = reinterpret_cast<const Segment<4> *>(_src);
310
+ Segment<4> *dst = reinterpret_cast<Segment<4> *>(_dst);
311
+
312
+ // Dispatch straight to aligned LDGSTS calls
313
+ accelerated_async_copy(group, dst, src, count / sizeof(*dst));
314
+ }
315
+ };
316
+
317
+ // byte-wide API
318
+ template <size_t Alignment, class TyGroup>
319
+ _CG_STATIC_QUALIFIER void _memcpy_async_dispatch_to_aligned_copy(const TyGroup &group, void *__restrict__ _dst,
320
+ const void *__restrict__ _src, size_t count) {
321
+ static_assert(!(Alignment & (Alignment - 1)), "Known static alignment dispatch must be a power of 2");
322
+ details::_memcpy_async_align_dispatch<Alignment>::copy(group, _dst, _src, count);
323
+ }
324
+
325
+ // Internal dispatch APIs
326
+ // These deduce the alignments and sizes necessary to invoke the underlying copy engine
327
+ template <typename Ty>
328
+ using is_void = _CG_STL_NAMESPACE::is_same<Ty, void>;
329
+
330
+ template <typename Ty>
331
+ using enable_if_not_void = typename _CG_STL_NAMESPACE::enable_if<!is_void<Ty>::value, void *>::type;
332
+
333
+ template <typename Ty>
334
+ using enable_if_void = typename _CG_STL_NAMESPACE::enable_if<is_void<Ty>::value, void *>::type;
335
+
336
+ template <typename Ty>
337
+ using enable_if_integral =
338
+ typename _CG_STL_NAMESPACE::enable_if<_CG_STL_NAMESPACE::is_integral<Ty>::value, void *>::type;
339
+
340
+ // byte-wide API using aligned_sized_t
341
+ template <class TyGroup, template <size_t> typename Alignment, size_t Hint>
342
+ _CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, void *__restrict__ _dst,
343
+ const void *__restrict__ _src, const Alignment<Hint> &count) {
344
+ constexpr size_t _align = (Hint > 16) ? 16 : Hint;
345
+
346
+ details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, (size_t)count);
347
+ }
348
+
349
+ // byte-wide API using type for aligment
350
+ template <class TyGroup, typename TyElem, typename TySize, size_t Hint = alignof(TyElem),
351
+ enable_if_not_void<TyElem> = nullptr, enable_if_integral<TySize> = nullptr>
352
+ _CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
353
+ const TyElem *__restrict__ _src, const TySize& count) {
354
+ constexpr size_t _align = (Hint > 16) ? 16 : Hint;
355
+
356
+ details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, count);
357
+ }
358
+
359
+ // byte-wide API with full alignment deduction required
360
+ template <class TyGroup, typename TyElem, typename TySize, enable_if_void<TyElem> = nullptr,
361
+ enable_if_integral<TySize> = nullptr>
362
+ _CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
363
+ const TyElem *__restrict__ _src, const TySize& count) {
364
+ details::_memcpy_async_dispatch_to_aligned_copy<1>(group, _dst, _src, count);
365
+ }
366
+
367
+ // 1d-datum API
368
+ template <class TyGroup, typename TyElem, size_t Hint = alignof(TyElem)>
369
+ _CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const size_t dstCount,
370
+ const TyElem *__restrict__ src, const size_t srcCount) {
371
+ constexpr unsigned int _align = Hint;
372
+ const size_t totalCount = min(dstCount, srcCount) * sizeof(TyElem);
373
+
374
+ details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
375
+ }
376
+
377
+ // 1d-datum API using aligned_size_t
378
+ template <class TyGroup, typename TyElem, template <size_t> typename Alignment, size_t Hint>
379
+ _CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const Alignment<Hint> &dstCount,
380
+ const TyElem *__restrict__ src, const Alignment<Hint> &srcCount) {
381
+ constexpr unsigned int _align = Hint;
382
+ const size_t totalCount = min((size_t)dstCount, (size_t)srcCount) * sizeof(TyElem);
383
+
384
+ details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
385
+ }
386
+
387
+ } // namespace details
388
+
389
+ /*
390
+ * Group submit batch of async-copy to cover contiguous 1D array
391
+ * and commit that batch to eventually wait for completion.
392
+ */
393
+ template <class TyGroup, typename TyElem, typename TySizeT>
394
+ _CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ _dst, const TyElem *__restrict__ _src,
395
+ const TySizeT &count) {
396
+ details::_memcpy_async_bytes(group, _dst, _src, count);
397
+ __pipeline_commit();
398
+ }
399
+
400
+ /*
401
+ * Group submit batch of async-copy to cover contiguous 1D array
402
+ * and commit that batch to eventually wait for completion.
403
+ * Object counts are in datum sized chunks, not bytes.
404
+ */
405
+ template <class TyGroup, class TyElem, typename DstLayout, typename SrcLayout>
406
+ _CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ dst, const DstLayout &dstLayout,
407
+ const TyElem *__restrict__ src, const SrcLayout &srcLayout) {
408
+ details::_memcpy_async_datum(group, dst, dstLayout, src, srcLayout);
409
+ __pipeline_commit();
410
+ }
411
+
412
+ /* Group wait for prior Nth stage of memcpy_async to complete. */
413
+ template <unsigned int Stage, class TyGroup>
414
+ _CG_STATIC_QUALIFIER void wait_prior(const TyGroup &group) {
415
+ __pipeline_wait_prior(Stage);
416
+ group.sync();
417
+ }
418
+
419
+ /* Group wait all previously submitted memcpy_async to complete. */
420
+ template <class TyGroup>
421
+ _CG_STATIC_QUALIFIER void wait(const TyGroup &group) {
422
+ __pipeline_wait_prior(0);
423
+ group.sync();
424
+ }
425
+
426
+ /***************** CG APIs including pipeline are deprecated *****************/
427
+
428
+ /* Group submit batch of async-copy to cover of contiguous 1D array
429
+ to a pipeline and commit the batch*/
430
+ template <class TyGroup, class TyElem>
431
+ _CG_DEPRECATED _CG_STATIC_QUALIFIER void memcpy_async(TyGroup &group, TyElem *dst, size_t dstCount, const TyElem *src, size_t srcCount,
432
+ nvcuda::experimental::pipeline &pipe) {
433
+ details::_memcpy_async_datum(group, dst, dstCount, src, srcCount);
434
+ pipe.commit();
435
+ }
436
+
437
+ /* Group wait for prior Nth stage of memcpy_async to complete. */
438
+ template <unsigned int Stage, class TyGroup>
439
+ _CG_DEPRECATED _CG_STATIC_QUALIFIER void wait_prior(TyGroup &group, nvcuda::experimental::pipeline &pipe) {
440
+ pipe.wait_prior<Stage>();
441
+ group.sync();
442
+ }
443
+
444
+ /* Group wait for stage-S of memcpy_async to complete. */
445
+ template <class TyGroup>
446
+ _CG_DEPRECATED _CG_STATIC_QUALIFIER void wait(TyGroup &group, nvcuda::experimental::pipeline &pipe, size_t stage) {
447
+ pipe.wait(stage);
448
+ group.sync();
449
+ }
450
+ _CG_END_NAMESPACE
451
+
452
+ #endif // _CG_ASYNC_H
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/coalesced_reduce.h ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_COALESCED_REDUCE_H_
50
+ #define _CG_COALESCED_REDUCE_H_
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+ #include "cooperative_groups.h"
55
+ #include "partitioning.h"
56
+ #include "coalesced_scan.h"
57
+
58
+ _CG_BEGIN_NAMESPACE
59
+
60
+ namespace details {
61
+
62
+ template <typename TyVal, typename TyOp, unsigned int TySize, typename ParentT>
63
+ _CG_QUALIFIER auto coalesced_reduce(const __single_warp_thread_block_tile<TySize, ParentT>& group,
64
+ TyVal&& val,
65
+ TyOp&& op) -> decltype(op(val, val)) {
66
+ auto out = val;
67
+ for (int mask = TySize >> 1; mask > 0; mask >>= 1) {
68
+ out = op(out, group.shfl_xor(out, mask));
69
+ }
70
+
71
+ return out;
72
+ }
73
+
74
+ template <typename TyVal, typename TyOp>
75
+ _CG_QUALIFIER auto coalesced_reduce(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
76
+ if (group.size() == 32) {
77
+ // Full coalesced group can go through faster path by being treated as a tile of size 32
78
+ auto tile = details::tiled_partition_internal<32, void>();
79
+ return coalesced_reduce(tile, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
80
+ }
81
+ else {
82
+ auto scan_result =
83
+ inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
84
+ unsigned int group_mask = _coalesced_group_data_access::get_mask(group);
85
+ unsigned int last_thread_id = 31 - __clz(group_mask);
86
+ return details::tile::shuffle_dispatch<TyVal>::shfl(
87
+ _CG_STL_NAMESPACE::forward<TyVal>(scan_result), group_mask, last_thread_id, 32);
88
+ }
89
+ }
90
+
91
+ } // details
92
+
93
+ _CG_END_NAMESPACE
94
+
95
+ #endif // _CG_COALESCED_REDUCE_H_
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/functional.h ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_FUNCTIONAL_H
50
+ #define _CG_FUNCTIONAL_H
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+
55
+ #ifdef _CG_CPP11_FEATURES
56
+ #ifdef _CG_USE_CUDA_STL
57
+ # include <cuda/std/functional>
58
+ #endif
59
+
60
+ _CG_BEGIN_NAMESPACE
61
+
62
+ namespace details {
63
+ #ifdef _CG_USE_CUDA_STL
64
+ using cuda::std::plus;
65
+ using cuda::std::bit_and;
66
+ using cuda::std::bit_xor;
67
+ using cuda::std::bit_or;
68
+ #else
69
+ template <typename Ty> struct plus {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 + arg2;}};
70
+ template <typename Ty> struct bit_and {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 & arg2;}};
71
+ template <typename Ty> struct bit_xor {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 ^ arg2;}};
72
+ template <typename Ty> struct bit_or {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 | arg2;}};
73
+ #endif // _CG_USE_PLATFORM_STL
74
+ } // details
75
+
76
+ template <typename Ty>
77
+ struct plus : public details::plus<Ty> {};
78
+
79
+ template <typename Ty>
80
+ struct less {
81
+ __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
82
+ return (arg2 < arg1) ? arg2 : arg1;
83
+ }
84
+ };
85
+
86
+ template <typename Ty>
87
+ struct greater {
88
+ __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
89
+ return (arg1 < arg2) ? arg2 : arg1;
90
+ }
91
+ };
92
+
93
+ template <typename Ty>
94
+ struct bit_and : public details::bit_and<Ty> {};
95
+
96
+ template <typename Ty>
97
+ struct bit_xor : public details::bit_xor<Ty> {};
98
+
99
+ template <typename Ty>
100
+ struct bit_or : public details::bit_or<Ty> {};
101
+
102
+ #if defined(_CG_HAS_STL_ATOMICS)
103
+ namespace details {
104
+ template <class Ty>
105
+ using _atomic_is_type_supported = _CG_STL_NAMESPACE::integral_constant<bool,
106
+ _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) == 4 || sizeof(Ty) == 8)>;
107
+
108
+ template <typename TyOp> struct _atomic_op_supported : public _CG_STL_NAMESPACE::false_type {};
109
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::plus<Ty>> : public _atomic_is_type_supported<Ty> {};
110
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::less<Ty>> : public _atomic_is_type_supported<Ty> {};
111
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::greater<Ty>> : public _atomic_is_type_supported<Ty> {};
112
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_and<Ty>> : public _atomic_is_type_supported<Ty> {};
113
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_or<Ty>> : public _atomic_is_type_supported<Ty> {};
114
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_xor<Ty>> : public _atomic_is_type_supported<Ty> {};
115
+
116
+ template<typename TyAtomic, typename TyVal, typename TyOp>
117
+ _CG_QUALIFIER remove_qual<TyVal> atomic_cas_fallback(TyAtomic&& atomic, TyVal&& val, TyOp&& op) {
118
+ auto old = atomic.load(cuda::std::memory_order_relaxed);
119
+ while(!atomic.compare_exchange_weak(old, op(old, val), cuda::std::memory_order_relaxed));
120
+ return old;
121
+ }
122
+
123
+ template<typename TyOp>
124
+ struct op_picker;
125
+
126
+ template<typename TyVal>
127
+ struct op_picker<cooperative_groups::plus<TyVal>> {
128
+ template<typename TyAtomic>
129
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
130
+ return atomic.fetch_add(val, cuda::std::memory_order_relaxed);
131
+ }
132
+ };
133
+
134
+ template<typename TyVal>
135
+ struct op_picker<cooperative_groups::less<TyVal>> {
136
+ template<typename TyAtomic>
137
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
138
+ return atomic.fetch_min(val, cuda::std::memory_order_relaxed);
139
+ }
140
+ };
141
+
142
+ template<typename TyVal>
143
+ struct op_picker<cooperative_groups::greater<TyVal>> {
144
+ template<typename TyAtomic>
145
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
146
+ return atomic.fetch_max(val, cuda::std::memory_order_relaxed);
147
+ }
148
+ };
149
+
150
+ template<typename TyVal>
151
+ struct op_picker<cooperative_groups::bit_and<TyVal>> {
152
+ template<typename TyAtomic>
153
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
154
+ return atomic.fetch_and(val, cuda::std::memory_order_relaxed);
155
+ }
156
+ };
157
+
158
+ template<typename TyVal>
159
+ struct op_picker<cooperative_groups::bit_xor<TyVal>> {
160
+ template<typename TyAtomic>
161
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
162
+ return atomic.fetch_xor(val, cuda::std::memory_order_relaxed);
163
+ }
164
+ };
165
+
166
+ template<typename TyVal>
167
+ struct op_picker<cooperative_groups::bit_or<TyVal>> {
168
+ template<typename TyAtomic>
169
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
170
+ return atomic.fetch_or(val, cuda::std::memory_order_relaxed);
171
+ }
172
+ };
173
+
174
+ template<bool atomic_supported>
175
+ struct atomic_update_dispatch {};
176
+
177
+ template<>
178
+ struct atomic_update_dispatch<false> {
179
+ template<typename TyAtomic, typename TyVal, typename TyOp>
180
+ _CG_STATIC_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
181
+ return atomic_cas_fallback(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
182
+ }
183
+ };
184
+
185
+ template<>
186
+ struct atomic_update_dispatch<true> {
187
+ template<typename TyAtomic, typename TyVal, typename TyOp>
188
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val, TyOp&& op) {
189
+ using dispatch = op_picker<details::remove_qual<TyOp>>;
190
+
191
+ return dispatch::atomic_update(atomic, val);
192
+ }
193
+ };
194
+
195
+ template<typename TyAtomic, typename TyVal, typename TyOp>
196
+ _CG_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
197
+ using dispatch = atomic_update_dispatch<_atomic_op_supported<details::remove_qual<TyOp>>::value>;
198
+
199
+ return dispatch::atomic_update(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
200
+ }
201
+
202
+ template<typename TyAtomic, typename TyVal>
203
+ _CG_QUALIFIER void atomic_store(TyAtomic& atomic, TyVal&& val) {
204
+ atomic.store(val, cuda::std::memory_order_relaxed);
205
+ }
206
+ }
207
+ #endif
208
+
209
+ _CG_END_NAMESPACE
210
+
211
+ #endif
212
+ #endif //_CG_FUNCTIONAL_H
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/helpers.h ADDED
@@ -0,0 +1,693 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _COOPERATIVE_GROUPS_HELPERS_H_
50
+ # define _COOPERATIVE_GROUPS_HELPERS_H_
51
+
52
+ #include "info.h"
53
+ #include "sync.h"
54
+
55
+ _CG_BEGIN_NAMESPACE
56
+
57
+ namespace details {
58
+ #ifdef _CG_CPP11_FEATURES
59
+ template <typename Ty> struct _is_float_or_half : public _CG_STL_NAMESPACE::is_floating_point<Ty> {};
60
+ # ifdef _CG_HAS_FP16_COLLECTIVE
61
+ template <> struct _is_float_or_half<__half> : public _CG_STL_NAMESPACE::true_type {};
62
+ template <> struct _is_float_or_half<__half2> : public _CG_STL_NAMESPACE::true_type {};
63
+ # endif
64
+ template <typename Ty>
65
+ using is_float_or_half = _is_float_or_half<typename _CG_STL_NAMESPACE::remove_cv<Ty>::type>;
66
+
67
+ // Non-STL utility templates
68
+ template <typename Ty>
69
+ using remove_qual = typename _CG_STL_NAMESPACE::remove_cv<typename _CG_STL_NAMESPACE::remove_reference<Ty>::type>::type;
70
+
71
+ template <typename TyLhs, typename TyRhs>
72
+ using is_op_type_same = _CG_STL_NAMESPACE::is_same<remove_qual<TyLhs>, remove_qual<TyRhs>
73
+ >;
74
+ #endif
75
+
76
+ template <typename TyTrunc>
77
+ _CG_STATIC_QUALIFIER TyTrunc vec3_to_linear(dim3 index, dim3 nIndex) {
78
+ return ((TyTrunc)index.z * nIndex.y * nIndex.x) +
79
+ ((TyTrunc)index.y * nIndex.x) +
80
+ (TyTrunc)index.x;
81
+ }
82
+
83
+ namespace cta {
84
+
85
+ _CG_STATIC_QUALIFIER void sync()
86
+ {
87
+ __barrier_sync(0);
88
+ }
89
+
90
+ _CG_STATIC_QUALIFIER unsigned int num_threads()
91
+ {
92
+ return static_cast<unsigned int>(blockDim.x * blockDim.y * blockDim.z);
93
+ }
94
+
95
+ _CG_STATIC_QUALIFIER unsigned int thread_rank()
96
+ {
97
+ return vec3_to_linear<unsigned int>(threadIdx, blockDim);
98
+ }
99
+
100
+ _CG_STATIC_QUALIFIER dim3 group_index()
101
+ {
102
+ return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
103
+ }
104
+
105
+ _CG_STATIC_QUALIFIER dim3 thread_index()
106
+ {
107
+ return dim3(threadIdx.x, threadIdx.y, threadIdx.z);
108
+ }
109
+
110
+ _CG_STATIC_QUALIFIER dim3 dim_threads()
111
+ {
112
+ return dim3(blockDim.x, blockDim.y, blockDim.z);
113
+ }
114
+
115
+ // Legacy aliases
116
+ _CG_STATIC_QUALIFIER unsigned int size()
117
+ {
118
+ return num_threads();
119
+ }
120
+
121
+ _CG_STATIC_QUALIFIER dim3 block_dim()
122
+ {
123
+ return dim_threads();
124
+ }
125
+
126
+ };
127
+
128
+ class _coalesced_group_data_access {
129
+ public:
130
+ // Retrieve mask of coalesced groups and tiles
131
+ template <typename TyGroup>
132
+ _CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup &group) {
133
+ return group.get_mask();
134
+ }
135
+
136
+ template <typename TyGroup>
137
+ _CG_STATIC_QUALIFIER TyGroup construct_from_mask(unsigned int mask) {
138
+ return TyGroup(mask);
139
+ }
140
+
141
+ template <typename TyGroup>
142
+ _CG_STATIC_QUALIFIER void modify_meta_group(TyGroup &group, unsigned int mgRank, unsigned int mgSize) {
143
+ group._data.coalesced.metaGroupRank = mgRank;
144
+ group._data.coalesced.metaGroupSize = mgSize;
145
+ }
146
+ };
147
+
148
+ namespace tile {
149
+ template <unsigned int TileCount, unsigned int TileMask, unsigned int LaneMask, unsigned int ShiftCount>
150
+ struct _tile_helpers{
151
+ _CG_STATIC_CONST_DECL unsigned int tileCount = TileCount;
152
+ _CG_STATIC_CONST_DECL unsigned int tileMask = TileMask;
153
+ _CG_STATIC_CONST_DECL unsigned int laneMask = LaneMask;
154
+ _CG_STATIC_CONST_DECL unsigned int shiftCount = ShiftCount;
155
+ };
156
+
157
+ template <unsigned int> struct tile_helpers;
158
+ template <> struct tile_helpers<32> : public _tile_helpers<1, 0xFFFFFFFF, 0x1F, 5> {};
159
+ template <> struct tile_helpers<16> : public _tile_helpers<2, 0x0000FFFF, 0x0F, 4> {};
160
+ template <> struct tile_helpers<8> : public _tile_helpers<4, 0x000000FF, 0x07, 3> {};
161
+ template <> struct tile_helpers<4> : public _tile_helpers<8, 0x0000000F, 0x03, 2> {};
162
+ template <> struct tile_helpers<2> : public _tile_helpers<16, 0x00000003, 0x01, 1> {};
163
+ template <> struct tile_helpers<1> : public _tile_helpers<32, 0x00000001, 0x00, 0> {};
164
+
165
+ #ifdef _CG_CPP11_FEATURES
166
+ namespace shfl {
167
+ /***********************************************************************************
168
+ * Recursively Sliced Shuffle
169
+ * Purpose:
170
+ * Slices an input type a number of times into integral types so that shuffles
171
+ * are well defined
172
+ * Expectations:
173
+ * This object *should not* be used from a reinterpret_cast pointer unless
174
+ * some alignment guarantees can be met. Use a memcpy to guarantee that loads
175
+ * from the integral types stored within are aligned and correct.
176
+ **********************************************************************************/
177
+ template <unsigned int count, bool intSized = (count <= sizeof(int))>
178
+ struct recursive_sliced_shuffle_helper;
179
+
180
+ template <unsigned int count>
181
+ struct recursive_sliced_shuffle_helper<count, true> {
182
+ int val;
183
+
184
+ template <typename TyFn>
185
+ _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
186
+ val = shfl(val);
187
+ }
188
+ };
189
+
190
+ template <unsigned int count>
191
+ struct recursive_sliced_shuffle_helper<count, false> {
192
+ int val;
193
+ recursive_sliced_shuffle_helper<count - sizeof(int)> next;
194
+
195
+ template <typename TyFn>
196
+ _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
197
+ val = shfl(val);
198
+ next.invoke_shuffle(shfl);
199
+ }
200
+ };
201
+ }
202
+
203
+ struct _memory_shuffle {
204
+ template <typename TyElem, typename TyShflFn>
205
+ _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
206
+ static_assert(sizeof(TyElem) <= 32, "Cooperative groups collectives are limited to types smaller than 32B");
207
+ return TyElem{};
208
+ }
209
+
210
+ template <typename TyElem, typename TyRet = remove_qual<TyElem>>
211
+ _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
212
+ auto shfl = [=](int val) -> int {
213
+ return 0;
214
+ };
215
+
216
+ return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
217
+ }
218
+
219
+ template <typename TyElem, typename TyRet = remove_qual<TyElem>>
220
+ _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
221
+ auto shfl = [=](int val) -> int {
222
+ return 0;
223
+ };
224
+
225
+ return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
226
+ }
227
+
228
+ template <typename TyElem, typename TyRet = remove_qual<TyElem>>
229
+ _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
230
+ auto shfl = [=](int val) -> int {
231
+ return 0;
232
+ };
233
+
234
+ return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
235
+ }
236
+
237
+ template <typename TyElem, typename TyRet = remove_qual<TyElem>>
238
+ _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
239
+ auto shfl = [=](int val) -> int {
240
+ return 0;
241
+ };
242
+
243
+ return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
244
+ }
245
+ };
246
+
247
+ /***********************************************************************************
248
+ * Intrinsic Device Function Shuffle
249
+ * Purpose:
250
+ * Uses a shuffle helper that has characteristics best suited for moving
251
+ * elements between threads
252
+ * Expectations:
253
+ * Object given will be forced into an l-value type so that it can be used
254
+ * with a helper structure that reinterprets the data into intrinsic compatible
255
+ * types
256
+ * Notes:
257
+ * !! TyRet is required so that objects are returned by value and not as
258
+ * dangling references depending on the value category of the passed object
259
+ **********************************************************************************/
260
+ struct _intrinsic_compat_shuffle {
261
+ template <unsigned int count>
262
+ using shfl_helper = shfl::recursive_sliced_shuffle_helper<count>;
263
+
264
+ template <typename TyElem, typename TyShflFn>
265
+ _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
266
+ static_assert(__is_trivially_copyable(TyElem), "Type is not compatible with device shuffle");
267
+ shfl_helper<sizeof(TyElem)> helper;
268
+ memcpy(&helper, &elem, sizeof(TyElem));
269
+ helper.invoke_shuffle(fn);
270
+ memcpy(&elem, &helper, sizeof(TyElem));
271
+ return elem;
272
+ }
273
+
274
+ template <typename TyElem, typename TyRet = remove_qual<TyElem>>
275
+ _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
276
+ auto shfl = [=](int val) -> int {
277
+ return __shfl_sync(gMask, val, srcRank, threads);
278
+ };
279
+
280
+ return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
281
+ }
282
+
283
+ template <typename TyElem, typename TyRet = remove_qual<TyElem>>
284
+ _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
285
+ auto shfl = [=](int val) -> int {
286
+ return __shfl_down_sync(gMask, val, delta, threads);
287
+ };
288
+
289
+ return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
290
+ }
291
+
292
+ template <typename TyElem, typename TyRet = remove_qual<TyElem>>
293
+ _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
294
+ auto shfl = [=](int val) -> int {
295
+ return __shfl_up_sync(gMask, val, delta, threads);
296
+ };
297
+
298
+ return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
299
+ }
300
+
301
+ template <typename TyElem, typename TyRet = remove_qual<TyElem>>
302
+ _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
303
+ auto shfl = [=](int val) -> int {
304
+ return __shfl_xor_sync(gMask, val, lMask, threads);
305
+ };
306
+
307
+ return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
308
+ }
309
+ };
310
+
311
+ struct _native_shuffle {
312
+ template <typename TyElem>
313
+ _CG_STATIC_QUALIFIER TyElem shfl(
314
+ TyElem elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
315
+ return static_cast<TyElem>(__shfl_sync(gMask, elem, srcRank, threads));
316
+ }
317
+
318
+ template <typename TyElem>
319
+ _CG_STATIC_QUALIFIER TyElem shfl_down(
320
+ TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
321
+ return static_cast<TyElem>(__shfl_down_sync(gMask, elem, delta, threads));
322
+ }
323
+
324
+ template <typename TyElem>
325
+ _CG_STATIC_QUALIFIER TyElem shfl_up(
326
+ TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
327
+ return static_cast<TyElem>(__shfl_up_sync(gMask, elem, delta, threads));
328
+ }
329
+
330
+ template <typename TyElem>
331
+ _CG_STATIC_QUALIFIER TyElem shfl_xor(
332
+ TyElem elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
333
+ return static_cast<TyElem>(__shfl_xor_sync(gMask, elem, lMask, threads));
334
+ }
335
+ };
336
+
337
+ // Almost all arithmetic types are supported by native shuffle
338
+ // Vector types are the exception
339
+ template <typename TyElem>
340
+ using use_native_shuffle = _CG_STL_NAMESPACE::integral_constant<
341
+ bool,
342
+ _CG_STL_NAMESPACE::is_integral<
343
+ remove_qual<TyElem>>::value ||
344
+ details::is_float_or_half<
345
+ remove_qual<TyElem>>::value
346
+ >;
347
+
348
+ constexpr unsigned long long _MemoryShuffleCutoff = 32;
349
+
350
+ template <typename TyElem,
351
+ bool IsNative = use_native_shuffle<TyElem>::value,
352
+ bool InMem = (sizeof(TyElem) > _MemoryShuffleCutoff)>
353
+ struct shuffle_dispatch;
354
+
355
+ template <typename TyElem>
356
+ struct shuffle_dispatch<TyElem, true, false> : public _native_shuffle {};
357
+
358
+ template <typename TyElem>
359
+ struct shuffle_dispatch<TyElem, false, false> : public _intrinsic_compat_shuffle {};
360
+
361
+ template <typename TyElem>
362
+ struct shuffle_dispatch<TyElem, false, true> : public _memory_shuffle {};
363
+
364
+ #endif //_CG_CPP11_FEATURES
365
+ };
366
+
367
+ namespace multi_grid {
368
+ struct multi_grid_functions;
369
+ };
370
+
371
+ namespace grid {
372
+ _CG_STATIC_QUALIFIER unsigned int barrier_arrive(unsigned int *bar) {
373
+ return details::sync_grids_arrive(bar);
374
+ }
375
+
376
+ _CG_STATIC_QUALIFIER void barrier_wait(unsigned int token, unsigned int *bar) {
377
+ details::sync_grids_wait(token, bar);
378
+ }
379
+
380
+ _CG_STATIC_QUALIFIER void sync(unsigned int *bar) {
381
+ unsigned int token = details::sync_grids_arrive(bar);
382
+ details::sync_grids_wait(token, bar);
383
+ }
384
+
385
+ _CG_STATIC_QUALIFIER unsigned long long num_blocks()
386
+ {
387
+ // grid.y * grid.z -> [max(65535) * max(65535)] fits within 4b, promote after multiplication
388
+ // grid.x * (grid.y * grid.z) -> [max(2^31-1) * max(65535 * 65535)] exceeds 4b, promote before multiplication
389
+ return (unsigned long long)gridDim.x * (gridDim.y * gridDim.z);
390
+ }
391
+
392
+ _CG_STATIC_QUALIFIER unsigned long long num_threads()
393
+ {
394
+ return num_blocks() * cta::num_threads();
395
+ }
396
+
397
+ _CG_STATIC_QUALIFIER unsigned long long block_rank()
398
+ {
399
+ return vec3_to_linear<unsigned long long>(blockIdx, gridDim);
400
+ }
401
+
402
+ _CG_STATIC_QUALIFIER unsigned long long thread_rank()
403
+ {
404
+ return block_rank() * cta::num_threads() + cta::thread_rank();
405
+ }
406
+
407
+ _CG_STATIC_QUALIFIER dim3 dim_blocks()
408
+ {
409
+ return dim3(gridDim.x, gridDim.y, gridDim.z);
410
+ }
411
+
412
+ _CG_STATIC_QUALIFIER dim3 block_index()
413
+ {
414
+ return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
415
+ }
416
+
417
+ _CG_STATIC_QUALIFIER dim3 dim_threads()
418
+ {
419
+ return dim3(gridDim.x * blockDim.x, gridDim.y * blockDim.y, gridDim.z * blockDim.z);
420
+ }
421
+
422
+ _CG_STATIC_QUALIFIER dim3 thread_index()
423
+ {
424
+ return dim3(blockIdx.x * blockDim.x + threadIdx.x,
425
+ blockIdx.y * blockDim.y + threadIdx.y,
426
+ blockIdx.z * blockDim.z + threadIdx.z);
427
+ }
428
+
429
+ #if defined(_CG_HAS_CLUSTER_GROUP)
430
+ _CG_STATIC_QUALIFIER dim3 dim_clusters() {
431
+ return __clusterGridDimInClusters();
432
+ }
433
+
434
+ _CG_STATIC_QUALIFIER unsigned long long num_clusters() {
435
+ const dim3 dimClusters = dim_clusters();
436
+ return dimClusters.x * dimClusters.y * dimClusters.z;
437
+ }
438
+
439
+ _CG_STATIC_QUALIFIER dim3 cluster_index() {
440
+ return __clusterIdx();
441
+ }
442
+
443
+ _CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
444
+ return vec3_to_linear<unsigned long long>(cluster_index(), dim_clusters());
445
+ }
446
+ #endif
447
+
448
+ // Legacy aliases
449
+ _CG_STATIC_QUALIFIER unsigned long long size()
450
+ {
451
+ return num_threads();
452
+ }
453
+
454
+ _CG_STATIC_QUALIFIER dim3 grid_dim()
455
+ {
456
+ return dim_blocks();
457
+ }
458
+ };
459
+
460
+
461
+ #if defined(_CG_HAS_MULTI_GRID_GROUP)
462
+
463
+ namespace multi_grid {
464
+ _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle()
465
+ {
466
+ #if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
467
+ //this function is defined in device runtime library
468
+ //which requires separate compilation mode (__CUDACC_RDC__)
469
+ //or extended whole program mode (__CUDACC_EWP__)
470
+ return (cudaCGGetIntrinsicHandle(cudaCGScopeMultiGrid));
471
+ #else /* !(__CUDACC_RDC__ || __CUDACC_EWP__) */
472
+ return 0;
473
+ #endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
474
+ }
475
+
476
+ _CG_STATIC_QUALIFIER void sync(const unsigned long long handle)
477
+ {
478
+ #if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
479
+ //this function is defined in device runtime library
480
+ //which requires separate compilation mode (__CUDACC_RDC__)
481
+ //or extended whole program mode (__CUDACC_EWP__)
482
+ cudaError_t err = cudaCGSynchronize(handle, 0);
483
+ #endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
484
+ }
485
+
486
+ _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle)
487
+ {
488
+ unsigned int numThreads = 0;
489
+ #if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
490
+ //this function is defined in device runtime library
491
+ //which requires separate compilation mode (__CUDACC_RDC__)
492
+ //or extended whole program mode (__CUDACC_EWP__)
493
+ cudaCGGetSize(&numThreads, NULL, handle);
494
+ #endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
495
+ return numThreads;
496
+ }
497
+
498
+ _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle)
499
+ {
500
+ unsigned int threadRank = 0;
501
+ #if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
502
+ //this function is defined in device runtime library
503
+ //which requires separate compilation mode (__CUDACC_RDC__)
504
+ //or extended whole program mode (__CUDACC_EWP__)
505
+ cudaCGGetRank(&threadRank, NULL, handle);
506
+ #endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
507
+ return threadRank;
508
+ }
509
+
510
+ _CG_STATIC_QUALIFIER unsigned int grid_rank(const unsigned long long handle)
511
+ {
512
+ unsigned int gridRank = 0;
513
+ #if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
514
+ //this function is defined in device runtime library
515
+ //which requires separate compilation mode (__CUDACC_RDC__)
516
+ //or extended whole program mode (__CUDACC_EWP__)
517
+ cudaCGGetRank(NULL, &gridRank, handle);
518
+ #endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
519
+ return gridRank;
520
+ }
521
+
522
+ _CG_STATIC_QUALIFIER unsigned int num_grids(const unsigned long long handle)
523
+ {
524
+ unsigned int numGrids = 0;
525
+ #if defined(__CUDACC_RDC__) || defined(__CUDACC_EWP__)
526
+ //this function is defined in device runtime library
527
+ //which requires separate compilation mode (__CUDACC_RDC__)
528
+ //or extended whole program mode (__CUDACC_EWP__)
529
+ cudaCGGetSize(NULL, &numGrids, handle);
530
+ #endif /* __CUDACC_RDC__ || __CUDACC_EWP__ */
531
+ return numGrids;
532
+ }
533
+
534
+ # ifdef _CG_CPP11_FEATURES
535
+ struct multi_grid_functions {
536
+ decltype(multi_grid::get_intrinsic_handle) *get_intrinsic_handle;
537
+ decltype(multi_grid::sync) *sync;
538
+ decltype(multi_grid::size) *size;
539
+ decltype(multi_grid::thread_rank) *thread_rank;
540
+ decltype(multi_grid::grid_rank) *grid_rank;
541
+ decltype(multi_grid::num_grids) *num_grids;
542
+ };
543
+
544
+ template <typename = void>
545
+ _CG_STATIC_QUALIFIER const multi_grid_functions* load_grid_intrinsics() {
546
+ __constant__ static const multi_grid_functions mgf {
547
+ &multi_grid::get_intrinsic_handle,
548
+ &multi_grid::sync,
549
+ &multi_grid::size,
550
+ &multi_grid::thread_rank,
551
+ &multi_grid::grid_rank,
552
+ &multi_grid::num_grids
553
+ };
554
+
555
+ return &mgf;
556
+ }
557
+ # endif
558
+ };
559
+ #endif
560
+
561
+ #if defined(_CG_HAS_CLUSTER_GROUP)
562
+ namespace cluster {
563
+
564
+ _CG_STATIC_QUALIFIER bool isReal()
565
+ {
566
+ return __clusterDimIsSpecified();
567
+ }
568
+
569
+ _CG_STATIC_QUALIFIER void barrier_arrive()
570
+ {
571
+ __cluster_barrier_arrive();
572
+ }
573
+
574
+ _CG_STATIC_QUALIFIER void barrier_wait()
575
+ {
576
+ __cluster_barrier_wait();
577
+ }
578
+
579
+ _CG_STATIC_QUALIFIER void sync()
580
+ {
581
+ barrier_arrive();
582
+ barrier_wait();
583
+ }
584
+
585
+ _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
586
+ {
587
+ return __cluster_query_shared_rank(addr);
588
+ }
589
+
590
+ template <typename T>
591
+ _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
592
+ {
593
+ return static_cast<T*>(__cluster_map_shared_rank(addr, rank));
594
+ }
595
+
596
+ _CG_STATIC_QUALIFIER dim3 block_index()
597
+ {
598
+ return __clusterRelativeBlockIdx();
599
+ }
600
+
601
+ _CG_STATIC_QUALIFIER unsigned int block_rank()
602
+ {
603
+ return __clusterRelativeBlockRank();
604
+ }
605
+
606
+ _CG_STATIC_QUALIFIER dim3 thread_index()
607
+ {
608
+ const dim3 blockIndex = block_index();
609
+ return dim3(blockIndex.x * blockDim.x + threadIdx.x,
610
+ blockIndex.y * blockDim.y + threadIdx.y,
611
+ blockIndex.z * blockDim.z + threadIdx.z);
612
+ }
613
+
614
+ _CG_STATIC_QUALIFIER unsigned int thread_rank()
615
+ {
616
+ return block_rank() * cta::num_threads() + cta::thread_rank();
617
+ }
618
+
619
+ _CG_STATIC_QUALIFIER dim3 dim_blocks()
620
+ {
621
+ return __clusterDim();
622
+ }
623
+
624
+ _CG_STATIC_QUALIFIER unsigned int num_blocks()
625
+ {
626
+ return __clusterSizeInBlocks();
627
+ }
628
+
629
+ _CG_STATIC_QUALIFIER dim3 dim_threads()
630
+ {
631
+ const dim3 dimBlocks = dim_blocks();
632
+ const unsigned int x = dimBlocks.x * blockDim.x;
633
+ const unsigned int y = dimBlocks.y * blockDim.y;
634
+ const unsigned int z = dimBlocks.z * blockDim.z;
635
+ return dim3(x, y, z);
636
+ }
637
+
638
+ _CG_STATIC_QUALIFIER unsigned int num_threads()
639
+ {
640
+ return num_blocks() * cta::num_threads();
641
+ }
642
+
643
+ };
644
+ #endif
645
+
646
+ _CG_STATIC_QUALIFIER unsigned int laneid()
647
+ {
648
+ unsigned int laneid;
649
+ asm ("mov.u32 %0, %%laneid;" : "=r"(laneid));
650
+ return laneid;
651
+ }
652
+
653
+ _CG_STATIC_QUALIFIER unsigned int lanemask32_eq()
654
+ {
655
+ unsigned int lanemask32_eq;
656
+ asm ("mov.u32 %0, %%lanemask_eq;" : "=r"(lanemask32_eq));
657
+ return (lanemask32_eq);
658
+ }
659
+
660
+ _CG_STATIC_QUALIFIER unsigned int lanemask32_lt()
661
+ {
662
+ unsigned int lanemask32_lt;
663
+ asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask32_lt));
664
+ return (lanemask32_lt);
665
+ }
666
+
667
+ _CG_STATIC_QUALIFIER void abort()
668
+ {
669
+ _CG_ABORT();
670
+ }
671
+
672
+ template <typename Ty>
673
+ _CG_QUALIFIER void assert_if_not_arithmetic() {
674
+ #ifdef _CG_CPP11_FEATURES
675
+ static_assert(
676
+ _CG_STL_NAMESPACE::is_integral<Ty>::value ||
677
+ details::is_float_or_half<Ty>::value,
678
+ "Error: Ty is neither integer or float"
679
+ );
680
+ #endif //_CG_CPP11_FEATURES
681
+ }
682
+
683
+ #ifdef _CG_CPP11_FEATURES
684
+ _CG_STATIC_QUALIFIER constexpr unsigned int log2(unsigned int x) {
685
+ return x == 1 ? 0 : 1 + log2(x / 2);
686
+ }
687
+ #endif //_CG_CPP11_FEATURES
688
+
689
+ }; // !Namespace internal
690
+
691
+ _CG_END_NAMESPACE
692
+
693
+ #endif /* !_COOPERATIVE_GROUPS_HELPERS_H_ */
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/memory.h ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _COOPERATIVE_GROUPS_MEMORY_H_
50
+ # define _COOPERATIVE_GROUPS_MEMORY_H_
51
+
52
+ #include "info.h"
53
+
54
+ _CG_BEGIN_NAMESPACE
55
+
56
+ #if defined(_CG_CPP11_FEATURES)
57
+ namespace details {
58
+ _CG_STATIC_CONST_DECL int scratch_num_reserved_bytes = 12;
59
+
60
+ #if defined(_CG_HAS_RESERVED_SHARED)
61
+ _CG_STATIC_QUALIFIER void* reserved_shared_ptr()
62
+ {
63
+ void *ptr;
64
+ asm ("{\n\t"
65
+ " .reg .u32 start;\n\t"
66
+ " .reg .u64 extended;\n\t"
67
+ " mov.u32 start, %%reserved_smem_offset_1;\n\t"
68
+ " cvt.u64.u32 extended, start;\n\t"
69
+ " cvta.shared.u64 %0, extended;\n\t"
70
+ "}"
71
+ : "=" _CG_ASM_PTR_CONSTRAINT(ptr));
72
+ return ptr;
73
+ }
74
+ #endif
75
+
76
+ struct multi_warp_scratch {
77
+ // One barrier per possible size of the group.
78
+ _CG_STATIC_CONST_DECL unsigned int memory_barriers_count = 5;
79
+ _CG_STATIC_CONST_DECL size_t sync_memory_size = memory_barriers_count * sizeof(barrier_t);
80
+
81
+ using communication_type = unsigned long long;
82
+ _CG_STATIC_CONST_DECL size_t communication_size = sizeof(communication_type);
83
+
84
+ // Layout of the scratch space:
85
+ barrier_t barriers[memory_barriers_count];
86
+ char reserved[scratch_num_reserved_bytes]; // Reserve 12 bytes for future use
87
+ communication_type communication_memory[default_max_block_size / 32];
88
+
89
+ _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int scratch_size_needed(unsigned int max_block_size) {
90
+ // One slot of collectives memory per warp.
91
+ return scratch_num_reserved_bytes + sync_memory_size + max_block_size / 32 * communication_size;
92
+ }
93
+
94
+ _CG_QUALIFIER void init_barriers(unsigned int thread_rank) {
95
+ if (thread_rank < memory_barriers_count) {
96
+ barriers[thread_rank] = 0;
97
+ }
98
+ }
99
+ };
100
+
101
+ #if defined(_CG_HAS_RESERVED_SHARED)
102
+ // CG can expect at least 288 bytes available in reserved shared
103
+ static_assert(sizeof(multi_warp_scratch) <= 288, "multi-warp scratch size is too large");
104
+ #endif
105
+
106
+ // Make sure the structure can fit into the user provided memory
107
+ static_assert(sizeof(multi_warp_scratch) <= multi_warp_scratch::scratch_size_needed(default_max_block_size),
108
+ "multi-warp scratch size is too large");
109
+
110
+
111
+ _CG_QUALIFIER multi_warp_scratch* get_scratch_ptr(void* user_scratch) {
112
+ void *ptr;
113
+ #if defined(_CG_HAS_RESERVED_SHARED)
114
+ ptr = reserved_shared_ptr();
115
+ #else
116
+ ptr = user_scratch;
117
+ #endif
118
+ return static_cast<multi_warp_scratch*>(ptr);
119
+
120
+ }
121
+
122
+ }
123
+
124
+ template <unsigned int MaxBlockSize = details::default_max_block_size>
125
+ struct __align__(details::multi_warp_scratch::communication_size) block_tile_memory {
126
+ private:
127
+ #if !defined(_CG_HAS_RESERVED_SHARED)
128
+ char scratch[details::multi_warp_scratch::scratch_size_needed(MaxBlockSize)];
129
+ #endif
130
+ };
131
+ #endif
132
+
133
+ _CG_END_NAMESPACE
134
+
135
+ #endif /* !_COOPERATIVE_GROUPS_MEMORY_H_ */
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/reduce.h ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_REDUCE_H_
50
+ #define _CG_REDUCE_H_
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+ #include "coalesced_reduce.h"
55
+ #include "functional.h"
56
+ #include "cooperative_groups.h"
57
+
58
+ _CG_BEGIN_NAMESPACE
59
+
60
+ namespace details {
61
+
62
+ template <class Ty>
63
+ using _redux_is_add_supported = _CG_STL_NAMESPACE::integral_constant<
64
+ bool,
65
+ _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) <= 4)>;
66
+
67
+ template <class Ty>
68
+ using redux_is_add_supported = _redux_is_add_supported<Ty>;
69
+
70
+ // A specialization for 64 bit logical operations is possible
71
+ // but for now only accelerate 32 bit bitwise ops
72
+ template <class Ty>
73
+ using redux_is_logical_supported = redux_is_add_supported<Ty>;
74
+
75
+ // Base operator support case
76
+ template <class TyOp, class Ty> struct _redux_op_supported : public _CG_STL_NAMESPACE::false_type {};
77
+ #ifdef _CG_HAS_OP_REDUX
78
+ template <class Ty> struct _redux_op_supported<cooperative_groups::plus<Ty>, Ty> : public redux_is_add_supported<Ty> {};
79
+ template <class Ty> struct _redux_op_supported<cooperative_groups::less<Ty>, Ty> : public redux_is_add_supported<Ty> {};
80
+ template <class Ty> struct _redux_op_supported<cooperative_groups::greater<Ty>, Ty> : public redux_is_add_supported<Ty> {};
81
+ template <class Ty> struct _redux_op_supported<cooperative_groups::bit_and<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
82
+ template <class Ty> struct _redux_op_supported<cooperative_groups::bit_or<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
83
+ template <class Ty> struct _redux_op_supported<cooperative_groups::bit_xor<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
84
+ #endif
85
+
86
+ template <class Ty, template <class> class TyOp>
87
+ using redux_op_supported = _redux_op_supported<
88
+ typename details::remove_qual<TyOp<Ty>>,
89
+ Ty>;
90
+
91
+ // Groups smaller than 16 actually have worse performance characteristics when used with redux
92
+ // tiles of size 16 and 32 perform the same or better and have better code generation profiles
93
+ template <class TyGroup> struct _redux_group_optimized : public _CG_STL_NAMESPACE::false_type {};
94
+
95
+ template <unsigned int Sz, typename TyPar>
96
+ struct _redux_group_optimized<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
97
+ bool,
98
+ (Sz >= 16)> {};
99
+ template <unsigned int Sz, typename TyPar>
100
+ struct _redux_group_optimized<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
101
+ bool,
102
+ (Sz >= 16)> {};
103
+ template <>
104
+ struct _redux_group_optimized<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
105
+
106
+ template <typename TyGroup>
107
+ using redux_group_optimized = _redux_group_optimized<details::remove_qual<TyGroup>>;
108
+
109
+ template <template <class> class TyOp>
110
+ _CG_STATIC_QUALIFIER int pick_redux(int mask, int val);
111
+ template <template <class> class TyOp>
112
+ _CG_STATIC_QUALIFIER unsigned int pick_redux(int mask, unsigned int val);
113
+
114
+ #ifdef _CG_HAS_OP_REDUX
115
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::plus>(int mask, int val) {
116
+ return __reduce_add_sync(mask, val);
117
+ }
118
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::less>(int mask, int val) {
119
+ return __reduce_min_sync(mask, val);
120
+ }
121
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::greater>(int mask, int val) {
122
+ return __reduce_max_sync(mask, val);
123
+ }
124
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_and>(int mask, int val) {
125
+ return __reduce_and_sync(mask, val);
126
+ }
127
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_xor>(int mask, int val) {
128
+ return __reduce_xor_sync(mask, val);
129
+ }
130
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_or>(int mask, int val) {
131
+ return __reduce_or_sync(mask, val);
132
+ }
133
+
134
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::plus>(int mask, unsigned int val) {
135
+ return __reduce_add_sync(mask, val);
136
+ }
137
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::less>(int mask, unsigned int val) {
138
+ return __reduce_min_sync(mask, val);
139
+ }
140
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::greater>(int mask, unsigned int val) {
141
+ return __reduce_max_sync(mask, val);
142
+ }
143
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_and>(int mask, unsigned int val) {
144
+ return __reduce_and_sync(mask, val);
145
+ }
146
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_xor>(int mask, unsigned int val) {
147
+ return __reduce_xor_sync(mask, val);
148
+ }
149
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_or>(int mask, unsigned int val) {
150
+ return __reduce_or_sync(mask, val);
151
+ }
152
+ #endif
153
+
154
+
155
+ template <typename TyVal, bool = _CG_STL_NAMESPACE::is_unsigned<TyVal>::value>
156
+ struct _accelerated_op;
157
+
158
+ // Signed type redux intrinsic dispatch
159
+ template <typename TyVal>
160
+ struct _accelerated_op<TyVal, false> {
161
+ template <template <class> class TyOp>
162
+ _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
163
+ return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<int>(val)));
164
+ }
165
+ };
166
+
167
+ // Unsigned type redux intrinsic dispatch
168
+ template <typename TyVal>
169
+ struct _accelerated_op<TyVal, true> {
170
+ template <template <class> class TyOp>
171
+ _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
172
+ return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<unsigned int>(val)));
173
+ }
174
+ };
175
+
176
+ template <typename TyVal>
177
+ using accelerated_op = _accelerated_op<TyVal>;
178
+
179
+
180
+ template <typename TyVal, typename TyFnInput, typename TyGroup>
181
+ class _redux_dispatch {
182
+ template <class Ty, template <class> class TyOp>
183
+ using _redux_is_usable = _CG_STL_NAMESPACE::integral_constant<bool,
184
+ redux_op_supported<Ty, TyOp>::value &&
185
+ redux_group_optimized<TyGroup>::value>;
186
+
187
+ template <class Ty, template <class> class TyOp>
188
+ using redux_is_usable = typename _CG_STL_NAMESPACE::enable_if<_redux_is_usable<Ty, TyOp>::value, void>::type*;
189
+
190
+ template <class Ty, template <class> class TyOp>
191
+ using redux_is_not_usable = typename _CG_STL_NAMESPACE::enable_if<!_redux_is_usable<Ty, TyOp>::value, void>::type*;
192
+
193
+ public:
194
+ // Dispatch to redux if the combination of op and args are supported
195
+ template<
196
+ template <class> class TyOp,
197
+ redux_is_usable<TyFnInput, TyOp> = nullptr>
198
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
199
+ // Retrieve the mask for the group and dispatch to redux
200
+ return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
201
+ }
202
+
203
+ template<
204
+ template <class> class TyOp,
205
+ redux_is_usable<TyFnInput, TyOp> = nullptr>
206
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
207
+ // Retrieve the mask for the group and dispatch to redux
208
+ return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
209
+ }
210
+
211
+ // Fallback shuffle sync reduction
212
+ template <
213
+ template <class> class TyOp,
214
+ redux_is_not_usable<TyFnInput, TyOp> = nullptr>
215
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
216
+ //Dispatch to fallback shuffle sync accelerated reduction
217
+ return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
218
+ }
219
+
220
+ };
221
+
222
+ // Group support for reduce.
223
+ template <class TyGroup> struct _reduce_group_supported : public _CG_STL_NAMESPACE::false_type {};
224
+
225
+ template <unsigned int Sz, typename TyPar>
226
+ struct _reduce_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
227
+ template <unsigned int Sz, typename TyPar>
228
+ struct _reduce_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
229
+ template <>
230
+ struct _reduce_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
231
+
232
+ template <typename TyGroup>
233
+ using reduce_group_supported = _reduce_group_supported<details::remove_qual<TyGroup>>;
234
+
235
+ template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
236
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
237
+ static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
238
+
239
+ using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
240
+ return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
241
+ }
242
+
243
+ template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
244
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
245
+ static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
246
+
247
+ using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
248
+ return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
249
+ }
250
+
251
+
252
+ template <typename TyVal, typename TyOp, typename TyGroup>
253
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
254
+ return details::coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
255
+ }
256
+
257
+ template <unsigned int GroupId>
258
+ struct tile_reduce_dispatch;
259
+
260
+ template <>
261
+ struct tile_reduce_dispatch<details::coalesced_group_id> {
262
+ template <typename TyGroup, typename TyVal, typename TyFn>
263
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
264
+ return details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
265
+ }
266
+ };
267
+
268
+ #if defined(_CG_CPP11_FEATURES)
269
+ template <>
270
+ struct tile_reduce_dispatch<details::multi_tile_group_id> {
271
+ template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
272
+ _CG_STATIC_QUALIFIER auto reduce(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
273
+ using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
274
+ using TyRet = details::remove_qual<TyVal>;
275
+ const unsigned int num_warps = Size / 32;
276
+
277
+ auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
278
+ *warp_scratch_location =
279
+ details::reduce(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
280
+ };
281
+ auto inter_warp_lambda =
282
+ [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
283
+ *thread_scratch_location =
284
+ details::reduce(subwarp, *thread_scratch_location, _CG_STL_NAMESPACE::forward<TyFn>(op));
285
+ };
286
+ return details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
287
+ }
288
+ };
289
+
290
+ template <unsigned int GroupId>
291
+ struct tile_async_reduce_dispatch;
292
+
293
+ template <>
294
+ struct tile_async_reduce_dispatch<details::coalesced_group_id> {
295
+ template <typename GroupT, typename TyDst, typename TyVal, typename TyFn, typename TyResHandler>
296
+ _CG_STATIC_QUALIFIER void reduce(const GroupT& group, TyDst& dst, TyVal&& val, TyFn&& op, TyResHandler& res_handler) {
297
+ // Do regular, in group reduction
298
+ auto result = details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
299
+
300
+ // One thread stores/updates the destination
301
+ if (group.thread_rank() == 0) {
302
+ res_handler(result);
303
+ }
304
+ }
305
+ };
306
+
307
+ template <>
308
+ struct tile_async_reduce_dispatch<details::multi_tile_group_id> {
309
+ template <unsigned int TySize, typename ParentT, typename TyDst, typename TyInputVal, typename TyFn, typename TyResHandler>
310
+ _CG_STATIC_QUALIFIER void reduce(const thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyInputVal&& val, TyFn&& op, TyResHandler& res_handler) {
311
+ using TyVal = remove_qual<TyInputVal>;
312
+ const unsigned int num_warps = TySize / 32;
313
+ details::barrier_t* sync_location = multi_warp_sync_location_getter(group);
314
+ auto warp_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, group.thread_rank() / 32);
315
+
316
+ // Do in warp reduce
317
+ auto warp = details::tiled_partition_internal<32, thread_block_tile<TySize, ParentT>>();
318
+ *warp_scratch_location = details::reduce(warp, _CG_STL_NAMESPACE::forward<TyInputVal>(val), op);
319
+
320
+ // Tile of size num_warps from the last warp to arrive does final reduction step
321
+ if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), num_warps)) {
322
+ auto subwarp = details::tiled_partition_internal<num_warps, decltype(warp)>();
323
+ if (subwarp.meta_group_rank() == 0) {
324
+ auto thread_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, subwarp.thread_rank());
325
+ auto thread_val = *thread_scratch_location;
326
+ // Release other warps, we read their contribution already.
327
+ subwarp.sync();
328
+ details::sync_warps_release(sync_location, subwarp.thread_rank() == 0, details::cta::thread_rank(), num_warps);
329
+ TyVal result = details::reduce(subwarp, thread_val, op);
330
+ // One thread stores the result or updates the atomic
331
+ if (subwarp.thread_rank() == 0) {
332
+ res_handler(result);
333
+ }
334
+ }
335
+ warp.sync();
336
+ }
337
+ }
338
+ };
339
+ #endif
340
+
341
+ template <typename TyGroup, typename TyInputVal, typename TyRetVal>
342
+ _CG_QUALIFIER void check_reduce_params() {
343
+ static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
344
+ static_assert(details::reduce_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
345
+ };
346
+
347
+ template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
348
+ _CG_QUALIFIER void check_async_reduce_params() {
349
+ check_reduce_params<TyGroup, TyInputVal, TyRetVal>();
350
+ static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
351
+ }
352
+ } // details
353
+
354
+ template <typename TyGroup, typename TyVal, typename TyFn>
355
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
356
+ details::check_reduce_params<TyGroup, details::remove_qual<TyVal>, decltype(op(val, val))>();
357
+
358
+ using dispatch = details::tile_reduce_dispatch<TyGroup::_group_id>;
359
+ return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
360
+ }
361
+
362
+ #if defined(_CG_CPP11_FEATURES)
363
+
364
+ # if defined(_CG_HAS_STL_ATOMICS)
365
+ template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
366
+ void _CG_QUALIFIER reduce_update_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
367
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
368
+ auto update_lambda = [&] (TyVal& result) {
369
+ details::atomic_update(dst, result, op);
370
+ };
371
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
372
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), update_lambda);
373
+ }
374
+
375
+ template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
376
+ void _CG_QUALIFIER reduce_update_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
377
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
378
+ auto update_lambda = [&] (TyVal& result) {
379
+ details::atomic_update(dst, result, op);
380
+ };
381
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
382
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), update_lambda);
383
+ }
384
+
385
+ template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
386
+ void _CG_QUALIFIER reduce_store_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
387
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
388
+ auto store_lambda = [&] (TyVal& result) {
389
+ details::atomic_store(dst, result);
390
+ };
391
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
392
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
393
+ }
394
+
395
+ template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
396
+ void _CG_QUALIFIER reduce_store_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
397
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
398
+ auto store_lambda = [&] (TyVal& result) {
399
+ details::atomic_store(dst, result);
400
+ };
401
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
402
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
403
+ }
404
+ # endif
405
+
406
+ template<typename TyGroup, typename TyVal, typename TyInputVal, typename TyFn>
407
+ void _CG_QUALIFIER reduce_store_async(const TyGroup& group, TyVal* dst, TyInputVal&& val, TyFn&& op) {
408
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
409
+ auto store_lambda = [&] (TyVal& result) {
410
+ *dst = result;
411
+ };
412
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
413
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
414
+ }
415
+ #endif
416
+
417
+ _CG_END_NAMESPACE
418
+
419
+ #endif // _CG_REDUCE_H_
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/details/scan.h ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_SCAN_H_
50
+ #define _CG_SCAN_H_
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+ #include "functional.h"
55
+ #include "coalesced_scan.h"
56
+
57
+ _CG_BEGIN_NAMESPACE
58
+
59
+ namespace details {
60
+
61
+ // Group support for scan.
62
+ template <class TyGroup> struct _scan_group_supported : public _CG_STL_NAMESPACE::false_type {};
63
+
64
+ template <unsigned int Sz, typename TyPar>
65
+ struct _scan_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
66
+ template <unsigned int Sz, typename TyPar>
67
+ struct _scan_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
68
+ template <>
69
+ struct _scan_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
70
+
71
+ template <typename TyGroup>
72
+ using scan_group_supported = _scan_group_supported<details::remove_qual<TyGroup>>;
73
+
74
+ template <bool IsIntegralPlus>
75
+ struct integral_optimized_scan;
76
+
77
+ enum class ScanType { exclusive, inclusive };
78
+
79
+ template <unsigned int GroupId, ScanType TyScan>
80
+ struct scan_dispatch;
81
+
82
+ template <ScanType TyScan>
83
+ struct scan_dispatch<details::coalesced_group_id, TyScan> {
84
+ template <typename TyGroup, typename TyVal, typename TyFn>
85
+ _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
86
+ auto scan_result = coalesced_inclusive_scan(group, val, op);
87
+ if (TyScan == ScanType::exclusive) {
88
+ scan_result = convert_inclusive_to_exclusive(group,
89
+ scan_result,
90
+ _CG_STL_NAMESPACE::forward<TyVal>(val),
91
+ _CG_STL_NAMESPACE::forward<TyFn>(op));
92
+ }
93
+ return scan_result;
94
+ }
95
+ };
96
+
97
+ #if defined(_CG_CPP11_FEATURES)
98
+ template <ScanType TyScan>
99
+ struct scan_dispatch<details::multi_tile_group_id, TyScan> {
100
+ template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
101
+ _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
102
+ using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
103
+ using TyRet = details::remove_qual<TyVal>;
104
+ const unsigned int num_warps = Size / 32;
105
+ // In warp scan result, calculated in warp_lambda
106
+ TyRet warp_scan;
107
+
108
+ // In warp scan, put sum in the warp_scratch_location
109
+ auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
110
+ warp_scan =
111
+ details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
112
+ if (warp.thread_rank() + 1 == warp.size()) {
113
+ *warp_scratch_location = warp_scan;
114
+ }
115
+ if (TyScan == ScanType::exclusive) {
116
+ warp_scan = warp.shfl_up(warp_scan, 1);
117
+ }
118
+ };
119
+
120
+ // Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
121
+ // to its in-warp scan result
122
+ auto inter_warp_lambda =
123
+ [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
124
+ auto thread_val = *thread_scratch_location;
125
+ auto result = coalesced_inclusive_scan(subwarp, thread_val, op);
126
+ *thread_scratch_location = convert_inclusive_to_exclusive(subwarp, result, thread_val, op);
127
+ };
128
+
129
+ TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
130
+ if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
131
+ return previous_warps_sum;
132
+ }
133
+ if (warpType::meta_group_rank() == 0) {
134
+ return warp_scan;
135
+ }
136
+ else {
137
+ return op(warp_scan, previous_warps_sum);
138
+ }
139
+ }
140
+ };
141
+
142
+ #if defined(_CG_HAS_STL_ATOMICS)
143
+ template <unsigned int GroupId, ScanType TyScan>
144
+ struct scan_update_dispatch;
145
+
146
+ template <ScanType TyScan>
147
+ struct scan_update_dispatch<details::coalesced_group_id, TyScan> {
148
+ template <typename TyGroup, typename TyAtomic, typename TyVal, typename TyFn>
149
+ _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
150
+ details::remove_qual<TyVal> old;
151
+
152
+ // Do regular in group scan
153
+ auto scan_result = details::coalesced_inclusive_scan(group, val, op);
154
+
155
+ // Last thread updates the atomic and distributes its old value to other threads
156
+ if (group.thread_rank() == group.size() - 1) {
157
+ old = atomic_update(dst, scan_result, _CG_STL_NAMESPACE::forward<TyFn>(op));
158
+ }
159
+ old = group.shfl(old, group.size() - 1);
160
+ if (TyScan == ScanType::exclusive) {
161
+ scan_result = convert_inclusive_to_exclusive(group, scan_result, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
162
+ }
163
+ scan_result = op(old, scan_result);
164
+ return scan_result;
165
+ }
166
+ };
167
+
168
+ template <ScanType TyScan>
169
+ struct scan_update_dispatch<details::multi_tile_group_id, TyScan> {
170
+ template <unsigned int Size, typename ParentT, typename TyAtomic, typename TyVal, typename TyFn>
171
+ _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
172
+ using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
173
+ using TyRet = details::remove_qual<TyVal>;
174
+ const unsigned int num_warps = Size / 32;
175
+ // In warp scan result, calculated in warp_lambda
176
+ TyRet warp_scan;
177
+
178
+ // In warp scan, put sum in the warp_scratch_location
179
+ auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
180
+ warp_scan =
181
+ details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
182
+ if (warp.thread_rank() + 1 == warp.size()) {
183
+ *warp_scratch_location = warp_scan;
184
+ }
185
+ if (TyScan == ScanType::exclusive) {
186
+ warp_scan = warp.shfl_up(warp_scan, 1);
187
+ }
188
+ };
189
+
190
+ // Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
191
+ // to its in-warp scan result
192
+ auto inter_warp_lambda =
193
+ [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
194
+ auto thread_val = *thread_scratch_location;
195
+ auto scan_result = details::coalesced_inclusive_scan(subwarp, thread_val, op);
196
+ TyRet offset;
197
+ // Single thread does the atomic update with sum of all contributions and reads the old value.
198
+ if (subwarp.thread_rank() == subwarp.size() - 1) {
199
+ offset = details::atomic_update(dst, scan_result, op);
200
+ }
201
+ offset = subwarp.shfl(offset, subwarp.size() - 1);
202
+ scan_result = convert_inclusive_to_exclusive(subwarp, scan_result, thread_val, op);
203
+ // Add offset read from the atomic to the scanned warp sum.
204
+ // Skipping first thread, since it got defautly constructed value from the conversion,
205
+ // it should just return the offset received from the thread that did the atomic update.
206
+ if (subwarp.thread_rank() != 0) {
207
+ offset = op(scan_result, offset);
208
+ }
209
+ *thread_scratch_location = offset;
210
+ };
211
+
212
+ TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
213
+ if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
214
+ return previous_warps_sum;
215
+ }
216
+ return op(warp_scan, previous_warps_sum);
217
+ }
218
+ };
219
+ #endif
220
+ #endif
221
+
222
+ template <typename TyGroup, typename TyInputVal, typename TyRetVal>
223
+ _CG_QUALIFIER void check_scan_params() {
224
+ static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
225
+ static_assert(details::scan_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
226
+ }
227
+
228
+ #if defined(_CG_HAS_STL_ATOMICS)
229
+ template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
230
+ _CG_QUALIFIER void check_scan_update_params() {
231
+ check_scan_params<TyGroup, TyInputVal, TyRetVal>();
232
+ static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
233
+ }
234
+ #endif
235
+
236
+ } // details
237
+
238
+ template <typename TyGroup, typename TyVal, typename TyFn>
239
+ _CG_QUALIFIER auto inclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
240
+ details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
241
+
242
+ using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
243
+ return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
244
+ }
245
+
246
+ template <typename TyGroup, typename TyVal>
247
+ _CG_QUALIFIER details::remove_qual<TyVal> inclusive_scan(const TyGroup& group, TyVal&& val) {
248
+ return inclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
249
+ }
250
+
251
+ template <typename TyGroup, typename TyVal, typename TyFn>
252
+ _CG_QUALIFIER auto exclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
253
+ details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
254
+
255
+ using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
256
+ return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
257
+ }
258
+
259
+ template <typename TyGroup, typename TyVal>
260
+ _CG_QUALIFIER details::remove_qual<TyVal> exclusive_scan(const TyGroup& group, TyVal&& val) {
261
+ return exclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
262
+ }
263
+
264
+ #if defined(_CG_HAS_STL_ATOMICS)
265
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
266
+ _CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
267
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
268
+
269
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
270
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
271
+ }
272
+
273
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
274
+ _CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco> & dst, TyInputVal&& val) {
275
+ return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
276
+ }
277
+
278
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
279
+ _CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
280
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
281
+
282
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
283
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
284
+ }
285
+
286
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
287
+ _CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val) {
288
+ return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
289
+ }
290
+
291
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
292
+ _CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
293
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
294
+
295
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
296
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
297
+ }
298
+
299
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
300
+ _CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco> & dst, TyInputVal&& val) {
301
+ return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
302
+ }
303
+
304
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
305
+ _CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
306
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
307
+
308
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
309
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
310
+ }
311
+
312
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
313
+ _CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val) {
314
+ return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
315
+ }
316
+ #endif
317
+
318
+ _CG_END_NAMESPACE
319
+
320
+ #endif // _CG_SCAN_H_
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/memcpy_async.h ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _COOPERATIVE_GROUPS_MEMCPY_ASYNC
50
+ #define _COOPERATIVE_GROUPS_MEMCPY_ASYNC
51
+
52
+ #include "../cooperative_groups.h"
53
+ #include "details/info.h"
54
+
55
+ #ifdef _CG_CPP11_FEATURES
56
+ # include "details/async.h"
57
+ #else
58
+ # error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
59
+ -std=c++11 compiler option.
60
+ #endif
61
+
62
+ #endif // _COOPERATIVE_GROUPS_MEMCPY_ASYNC
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/reduce.h ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _COOPERATIVE_GROUPS_REDUCE_H
50
+ #define _COOPERATIVE_GROUPS_REDUCE_H
51
+
52
+ #include "../cooperative_groups.h"
53
+ #include "details/info.h"
54
+
55
+ #ifdef _CG_CPP11_FEATURES
56
+ # include "details/reduce.h"
57
+ #else
58
+ # error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
59
+ -std=c++11 compiler option.
60
+ #endif
61
+
62
+
63
+ #endif //_COOPERATIVE_GROUPS_REDUCE_H
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cooperative_groups/scan.h ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _COOPERATIVE_GROUPS_SCAN_H
50
+ #define _COOPERATIVE_GROUPS_SCAN_H
51
+
52
+ #include "../cooperative_groups.h"
53
+ #include "details/info.h"
54
+
55
+ #ifdef _CG_CPP11_FEATURES
56
+ # include "details/scan.h"
57
+ #else
58
+ # error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
59
+ -std=c++11 compiler option.
60
+ #endif
61
+
62
+
63
+ #endif //_COOPERATIVE_GROUPS_SCAN_H
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/common_functions.h ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("crt/common_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "crt/common_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__
58
+ #endif
59
+
60
+ #if !defined(__COMMON_FUNCTIONS_H__)
61
+ #define __COMMON_FUNCTIONS_H__
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #if defined(__cplusplus) && defined(__CUDACC__)
70
+
71
+ #include "builtin_types.h"
72
+ #include "host_defines.h"
73
+
74
+ #define __CUDACC_VER__ "__CUDACC_VER__ is no longer supported. Use __CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__, and __CUDACC_VER_BUILD__ instead."
75
+
76
+ #ifndef __CUDA_API_VER_MAJOR__
77
+ #define __CUDA_API_VER_MAJOR__ __CUDACC_VER_MAJOR__
78
+ #endif /* __CUDA_API_VER_MAJOR__ */
79
+
80
+ #ifndef __CUDA_API_VER_MINOR__
81
+ #define __CUDA_API_VER_MINOR__ __CUDACC_VER_MINOR__
82
+ #endif /* __CUDA_API_VER_MINOR__ */
83
+
84
+ #if !defined(__CUDACC_RTC__)
85
+ #include <string.h>
86
+ #include <time.h>
87
+
88
+ extern "C"
89
+ {
90
+ #endif /* !__CUDACC_RTC__ */
91
+ extern _CRTIMP __host__ __device__ __device_builtin__ __cudart_builtin__ clock_t __cdecl clock(void)
92
+ #if defined(__QNX__)
93
+ asm("clock32")
94
+ #endif
95
+ __THROW;
96
+ extern __host__ __device__ __device_builtin__ __cudart_builtin__ void* __cdecl memset(void*, int, size_t) __THROW;
97
+ extern __host__ __device__ __device_builtin__ __cudart_builtin__ void* __cdecl memcpy(void*, const void*, size_t) __THROW;
98
+ #if !defined(__CUDACC_RTC__)
99
+ }
100
+ #endif /* !__CUDACC_RTC__ */
101
+
102
+ #if defined(__CUDA_ARCH__)
103
+
104
+ #if defined(__CUDACC_RTC__)
105
+ inline __host__ __device__ void* operator new(size_t, void *p) { return p; }
106
+ inline __host__ __device__ void* operator new[](size_t, void *p) { return p; }
107
+ inline __host__ __device__ void operator delete(void*, void*) { }
108
+ inline __host__ __device__ void operator delete[](void*, void*) { }
109
+ #else /* !__CUDACC_RTC__ */
110
+ #ifndef __CUDA_INTERNAL_SKIP_CPP_HEADERS__
111
+ #include <new>
112
+ #endif
113
+
114
+ #if defined (__GNUC__)
115
+
116
+ #define STD \
117
+ std::
118
+
119
+ #else /* __GNUC__ */
120
+
121
+ #define STD
122
+
123
+ #endif /* __GNUC__ */
124
+
125
+ extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new(STD size_t, void*) throw();
126
+ extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new[](STD size_t, void*) throw();
127
+ extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, void*) throw();
128
+ extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, void*) throw();
129
+ # if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__)
130
+ extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, STD size_t) throw();
131
+ extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, STD size_t) throw();
132
+ #endif /* __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__) */
133
+ #endif /* __CUDACC_RTC__ */
134
+
135
+ #if !defined(__CUDACC_RTC__)
136
+ #include <stdio.h>
137
+ #include <stdlib.h>
138
+ #endif /* !__CUDACC_RTC__ */
139
+
140
+ #if defined(__QNX__) && !defined(_LIBCPP_VERSION)
141
+ namespace std {
142
+ #endif
143
+ extern "C"
144
+ {
145
+ extern
146
+ #if !defined(_MSC_VER) || _MSC_VER < 1900
147
+ _CRTIMP
148
+ #endif
149
+
150
+ #if defined(__GLIBC__) && defined(__GLIBC_MINOR__) && ( (__GLIBC__ < 2) || ( (__GLIBC__ == 2) && (__GLIBC_MINOR__ < 3) ) )
151
+ __host__ __device__ __device_builtin__ __cudart_builtin__ int __cdecl printf(const char*, ...) __THROW;
152
+ #else /* newer glibc */
153
+ __host__ __device__ __device_builtin__ __cudart_builtin__ int __cdecl printf(const char*, ...);
154
+ #endif /* defined(__GLIBC__) && defined(__GLIBC_MINOR__) && ( (__GLIBC__ < 2) || ( (__GLIBC__ == 2) && (__GLIBC_MINOR__ < 3) ) ) */
155
+
156
+
157
+ extern _CRTIMP __host__ __device__ __cudart_builtin__ void* __cdecl malloc(size_t) __THROW;
158
+ extern _CRTIMP __host__ __device__ __cudart_builtin__ void __cdecl free(void*) __THROW;
159
+
160
+ #if defined(_MSC_VER)
161
+ extern __host__ __device__ __cudart_builtin__ void* __cdecl _alloca(size_t);
162
+ #endif
163
+
164
+ #if defined(__QNX__)
165
+ #undef alloca
166
+ #define alloca(__S) __builtin_alloca(__S)
167
+ #endif
168
+ }
169
+ #if defined(__QNX__) && !defined(_LIBCPP_VERSION)
170
+ } /* std */
171
+ #endif
172
+
173
+ #if !defined(__CUDACC_RTC__)
174
+ #include <assert.h>
175
+ #endif /* !__CUDACC_RTC__ */
176
+
177
+ extern "C"
178
+ {
179
+ #if defined(__CUDACC_RTC__)
180
+ extern __host__ __device__ void __assertfail(const char * __assertion,
181
+ const char *__file,
182
+ unsigned int __line,
183
+ const char *__function,
184
+ size_t charsize);
185
+ #elif defined(__APPLE__)
186
+ #define __builtin_expect(exp,c) (exp)
187
+ extern __host__ __device__ __cudart_builtin__ void __assert_rtn(
188
+ const char *, const char *, int, const char *);
189
+ #elif defined(__ANDROID__)
190
+ extern __host__ __device__ __cudart_builtin__ void __assert2(
191
+ const char *, int, const char *, const char *);
192
+ #elif defined(__QNX__)
193
+ #if !defined(_LIBCPP_VERSION)
194
+ namespace std {
195
+ #endif
196
+ extern __host__ __device__ __cudart_builtin__ void __assert(
197
+ const char *, const char *, unsigned int, const char *);
198
+ #if !defined(_LIBCPP_VERSION)
199
+ }
200
+ #endif
201
+ #elif defined(__HORIZON__)
202
+ extern __host__ __device__ __cudart_builtin__ void __assert_fail(
203
+ const char *, const char *, int, const char *);
204
+ #elif defined(__GNUC__)
205
+ extern __host__ __device__ __cudart_builtin__ void __assert_fail(
206
+ const char *, const char *, unsigned int, const char *)
207
+ __THROW;
208
+ #elif defined(_WIN32)
209
+ extern __host__ __device__ __cudart_builtin__ _CRTIMP void __cdecl _wassert(
210
+ const wchar_t *, const wchar_t *, unsigned);
211
+ #endif
212
+ }
213
+
214
+ #if defined(__CUDACC_RTC__)
215
+ #ifdef NDEBUG
216
+ #define assert(e) (static_cast<void>(0))
217
+ #else /* !NDEBUG */
218
+ #define __ASSERT_STR_HELPER(x) #x
219
+ #define assert(e) ((e) ? static_cast<void>(0)\
220
+ : __assertfail(__ASSERT_STR_HELPER(e), __FILE__,\
221
+ __LINE__, __PRETTY_FUNCTION__,\
222
+ sizeof(char)))
223
+ #endif /* NDEBUG */
224
+ __host__ __device__ void* operator new(size_t);
225
+ __host__ __device__ void* operator new[](size_t);
226
+ __host__ __device__ void operator delete(void*);
227
+ __host__ __device__ void operator delete[](void*);
228
+ # if __cplusplus >= 201402L
229
+ __host__ __device__ void operator delete(void*, size_t);
230
+ __host__ __device__ void operator delete[](void*, size_t);
231
+ #endif /* __cplusplus >= 201402L */
232
+
233
+ #if __cplusplus >= 201703L
234
+ namespace std { enum class align_val_t : size_t {}; }
235
+ __host__ __device__ void* __cdecl operator new(size_t sz, std::align_val_t) noexcept;
236
+ __host__ __device__ void* __cdecl operator new[](size_t sz, std::align_val_t) noexcept;
237
+ __host__ __device__ void __cdecl operator delete(void* ptr, std::align_val_t) noexcept;
238
+ __host__ __device__ void __cdecl operator delete[](void* ptr, std::align_val_t) noexcept;
239
+ __host__ __device__ void __cdecl operator delete(void* ptr, size_t, std::align_val_t) noexcept;
240
+ __host__ __device__ void __cdecl operator delete[](void* ptr, size_t, std::align_val_t) noexcept;
241
+ #endif /* __cplusplus >= 201703L */
242
+
243
+ #else /* !__CUDACC_RTC__ */
244
+ #if defined (__GNUC__)
245
+
246
+ #define __NV_GLIBCXX_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
247
+
248
+ #if (__cplusplus >= 201103L) && ((!(defined(__QNX__) && defined(_LIBCPP_VERSION))) || (defined(__QNX__) && __NV_GLIBCXX_VERSION >= 80300))
249
+ #define THROWBADALLOC
250
+ #else
251
+ #if defined(__ANDROID__) && !defined(_LIBCPP_VERSION) && (defined(__BIONIC__) || __NV_GLIBCXX_VERSION < 40900)
252
+ #define THROWBADALLOC
253
+ #else
254
+ #define THROWBADALLOC throw(STD bad_alloc)
255
+ #endif
256
+ #endif
257
+ #define __DELETE_THROW throw()
258
+
259
+ #undef __NV_GLIBCXX_VERSION
260
+
261
+ #else /* __GNUC__ */
262
+
263
+ #define THROWBADALLOC throw(...)
264
+
265
+ #endif /* __GNUC__ */
266
+
267
+ extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new(STD size_t) THROWBADALLOC;
268
+ extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new[](STD size_t) THROWBADALLOC;
269
+ extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*) throw();
270
+ extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*) throw();
271
+ # if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__)
272
+ extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, STD size_t) throw();
273
+ extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, STD size_t) throw();
274
+ #endif /* __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__) */
275
+
276
+ #if __cpp_aligned_new
277
+ extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new(STD size_t, std::align_val_t);
278
+ extern __host__ __device__ __cudart_builtin__ void* __cdecl operator new[](STD size_t, std::align_val_t);
279
+ extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, std::align_val_t) noexcept;
280
+ extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, std::align_val_t) noexcept;
281
+ extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete(void*, STD size_t, std::align_val_t) noexcept;
282
+ extern __host__ __device__ __cudart_builtin__ void __cdecl operator delete[](void*, STD size_t, std::align_val_t) noexcept;
283
+ #endif /* __cpp_aligned_new */
284
+
285
+ #undef THROWBADALLOC
286
+ #undef STD
287
+ #endif /* __CUDACC_RTC__ */
288
+
289
+ #endif /* __CUDA_ARCH__ */
290
+
291
+ #endif /* __cplusplus && __CUDACC__ */
292
+
293
+ /*******************************************************************************
294
+ * *
295
+ * *
296
+ * *
297
+ *******************************************************************************/
298
+
299
+ #if defined(__CUDACC_RTC__) && (__CUDA_ARCH__ >= 350)
300
+ #include "cuda_device_runtime_api.h"
301
+ #endif
302
+
303
+ #include "math_functions.h"
304
+
305
+ #endif /* !__COMMON_FUNCTIONS_H__ */
306
+
307
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__)
308
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
309
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__
310
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/cudacc_ext.h ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2021-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("crt/cudacc_ext.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "crt/cudacc_ext.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDACC_EXT_H__
58
+ #endif
59
+
60
+
61
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDACC_EXT_H__)
62
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
63
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDACC_EXT_H__
64
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_double_functions.h ADDED
@@ -0,0 +1,1192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("crt/device_double_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "crt/device_double_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H__
58
+ #endif
59
+
60
+ #if !defined(__DEVICE_DOUBLE_FUNCTIONS_H__)
61
+ #define __DEVICE_DOUBLE_FUNCTIONS_H__
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #if defined(__cplusplus) && defined(__CUDACC__)
70
+
71
+ /*******************************************************************************
72
+ * *
73
+ * *
74
+ * *
75
+ *******************************************************************************/
76
+
77
+ #if defined(__CUDACC_RTC__)
78
+ #define __DEVICE_DOUBLE_FUNCTIONS_DECL__ __device__
79
+ #else
80
+ #define __DEVICE_DOUBLE_FUNCTIONS_DECL__ static __inline__ __device__
81
+ #endif /* __CUDACC_RTC__ */
82
+
83
+ #include "builtin_types.h"
84
+ #include "device_types.h"
85
+ #include "host_defines.h"
86
+
87
+ //NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
88
+ #define EXCLUDE_FROM_RTC
89
+
90
+ extern "C"
91
+ {
92
+ /**
93
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
94
+ * \brief Reinterpret bits in a double as a 64-bit signed integer.
95
+ *
96
+ * Reinterpret the bits in the double-precision floating-point value \p x
97
+ * as a signed 64-bit integer.
98
+ * \return Returns reinterpreted value.
99
+ */
100
+ extern __device__ __device_builtin__ long long int __double_as_longlong(double x);
101
+ /**
102
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
103
+ * \brief Reinterpret bits in a 64-bit signed integer as a double.
104
+ *
105
+ * Reinterpret the bits in the 64-bit signed integer value \p x as
106
+ * a double-precision floating-point value.
107
+ * \return Returns reinterpreted value.
108
+ */
109
+ extern __device__ __device_builtin__ double __longlong_as_double(long long int x);
110
+ /**
111
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
112
+ * \brief Compute
113
+ * \latexonly $x \times y + z$ \endlatexonly
114
+ * \xmlonly
115
+ * <d4p_MathML outputclass="xmlonly">
116
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
117
+ * <m:mi>x</m:mi>
118
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
119
+ * <m:mi>y</m:mi>
120
+ * <m:mo>+</m:mo>
121
+ * <m:mi>z</m:mi>
122
+ * </m:math>
123
+ * </d4p_MathML>
124
+ * \endxmlonly
125
+ * as a single operation in round-to-nearest-even mode.
126
+ *
127
+ * Computes the value of
128
+ * \latexonly $x \times y + z$ \endlatexonly
129
+ * \xmlonly
130
+ * <d4p_MathML outputclass="xmlonly">
131
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
132
+ * <m:mi>x</m:mi>
133
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
134
+ * <m:mi>y</m:mi>
135
+ * <m:mo>+</m:mo>
136
+ * <m:mi>z</m:mi>
137
+ * </m:math>
138
+ * </d4p_MathML>
139
+ * \endxmlonly
140
+ * as a single ternary operation, rounding the
141
+ * result once in round-to-nearest-even mode.
142
+ *
143
+ * \return Returns the rounded value of
144
+ * \latexonly $x \times y + z$ \endlatexonly
145
+ * \xmlonly
146
+ * <d4p_MathML outputclass="xmlonly">
147
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
148
+ * <m:mi>x</m:mi>
149
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
150
+ * <m:mi>y</m:mi>
151
+ * <m:mo>+</m:mo>
152
+ * <m:mi>z</m:mi>
153
+ * </m:math>
154
+ * </d4p_MathML>
155
+ * \endxmlonly
156
+ * as a single operation.
157
+ * - fmaf(
158
+ * \latexonly $\pm \infty$ \endlatexonly
159
+ * \xmlonly
160
+ * <d4p_MathML outputclass="xmlonly">
161
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
162
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
163
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
164
+ * </m:math>
165
+ * </d4p_MathML>
166
+ * \endxmlonly
167
+ * ,
168
+ * \latexonly $\pm 0$ \endlatexonly
169
+ * \xmlonly
170
+ * <d4p_MathML outputclass="xmlonly">
171
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
172
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
173
+ * <m:mn>0</m:mn>
174
+ * </m:math>
175
+ * </d4p_MathML>
176
+ * \endxmlonly
177
+ * , \p z) returns NaN.
178
+ * - fmaf(
179
+ * \latexonly $\pm 0$ \endlatexonly
180
+ * \xmlonly
181
+ * <d4p_MathML outputclass="xmlonly">
182
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
183
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
184
+ * <m:mn>0</m:mn>
185
+ * </m:math>
186
+ * </d4p_MathML>
187
+ * \endxmlonly
188
+ * ,
189
+ * \latexonly $\pm \infty$ \endlatexonly
190
+ * \xmlonly
191
+ * <d4p_MathML outputclass="xmlonly">
192
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
193
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
194
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
195
+ * </m:math>
196
+ * </d4p_MathML>
197
+ * \endxmlonly
198
+ * , \p z) returns NaN.
199
+ * - fmaf(\p x, \p y,
200
+ * \latexonly $-\infty$ \endlatexonly
201
+ * \xmlonly
202
+ * <d4p_MathML outputclass="xmlonly">
203
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
204
+ * <m:mo>-</m:mo>
205
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
206
+ * </m:math>
207
+ * </d4p_MathML>
208
+ * \endxmlonly
209
+ * ) returns NaN if
210
+ * \latexonly $x \times y$ \endlatexonly
211
+ * \xmlonly
212
+ * <d4p_MathML outputclass="xmlonly">
213
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
214
+ * <m:mi>x</m:mi>
215
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
216
+ * <m:mi>y</m:mi>
217
+ * </m:math>
218
+ * </d4p_MathML>
219
+ * \endxmlonly
220
+ * is an exact
221
+ * \latexonly $+\infty$ \endlatexonly
222
+ * \xmlonly
223
+ * <d4p_MathML outputclass="xmlonly">
224
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
225
+ * <m:mo>+</m:mo>
226
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
227
+ * </m:math>
228
+ * </d4p_MathML>
229
+ * \endxmlonly
230
+ * .
231
+ * - fmaf(\p x, \p y,
232
+ * \latexonly $+\infty$ \endlatexonly
233
+ * \xmlonly
234
+ * <d4p_MathML outputclass="xmlonly">
235
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
236
+ * <m:mo>+</m:mo>
237
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
238
+ * </m:math>
239
+ * </d4p_MathML>
240
+ * \endxmlonly
241
+ * ) returns NaN if
242
+ * \latexonly $x \times y$ \endlatexonly
243
+ * \xmlonly
244
+ * <d4p_MathML outputclass="xmlonly">
245
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
246
+ * <m:mi>x</m:mi>
247
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
248
+ * <m:mi>y</m:mi>
249
+ * </m:math>
250
+ * </d4p_MathML>
251
+ * \endxmlonly
252
+ * is an exact
253
+ * \latexonly $-\infty$ \endlatexonly
254
+ * \xmlonly
255
+ * <d4p_MathML outputclass="xmlonly">
256
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
257
+ * <m:mo>-</m:mo>
258
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
259
+ * </m:math>
260
+ * </d4p_MathML>
261
+ * \endxmlonly
262
+ * .
263
+ *
264
+ * \note_accuracy_double
265
+ */
266
+ extern __device__ __device_builtin__ double __fma_rn(double x, double y, double z);
267
+ /**
268
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
269
+ * \brief Compute
270
+ * \latexonly $x \times y + z$ \endlatexonly
271
+ * \xmlonly
272
+ * <d4p_MathML outputclass="xmlonly">
273
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
274
+ * <m:mi>x</m:mi>
275
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
276
+ * <m:mi>y</m:mi>
277
+ * <m:mo>+</m:mo>
278
+ * <m:mi>z</m:mi>
279
+ * </m:math>
280
+ * </d4p_MathML>
281
+ * \endxmlonly
282
+ * as a single operation in round-towards-zero mode.
283
+ *
284
+ * Computes the value of
285
+ * \latexonly $x \times y + z$ \endlatexonly
286
+ * \xmlonly
287
+ * <d4p_MathML outputclass="xmlonly">
288
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
289
+ * <m:mi>x</m:mi>
290
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
291
+ * <m:mi>y</m:mi>
292
+ * <m:mo>+</m:mo>
293
+ * <m:mi>z</m:mi>
294
+ * </m:math>
295
+ * </d4p_MathML>
296
+ * \endxmlonly
297
+ * as a single ternary operation, rounding the
298
+ * result once in round-towards-zero mode.
299
+ *
300
+ * \return Returns the rounded value of
301
+ * \latexonly $x \times y + z$ \endlatexonly
302
+ * \xmlonly
303
+ * <d4p_MathML outputclass="xmlonly">
304
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
305
+ * <m:mi>x</m:mi>
306
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
307
+ * <m:mi>y</m:mi>
308
+ * <m:mo>+</m:mo>
309
+ * <m:mi>z</m:mi>
310
+ * </m:math>
311
+ * </d4p_MathML>
312
+ * \endxmlonly
313
+ * as a single operation.
314
+ * - fmaf(
315
+ * \latexonly $\pm \infty$ \endlatexonly
316
+ * \xmlonly
317
+ * <d4p_MathML outputclass="xmlonly">
318
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
319
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
320
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
321
+ * </m:math>
322
+ * </d4p_MathML>
323
+ * \endxmlonly
324
+ * ,
325
+ * \latexonly $\pm 0$ \endlatexonly
326
+ * \xmlonly
327
+ * <d4p_MathML outputclass="xmlonly">
328
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
329
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
330
+ * <m:mn>0</m:mn>
331
+ * </m:math>
332
+ * </d4p_MathML>
333
+ * \endxmlonly
334
+ * , \p z) returns NaN.
335
+ * - fmaf(
336
+ * \latexonly $\pm 0$ \endlatexonly
337
+ * \xmlonly
338
+ * <d4p_MathML outputclass="xmlonly">
339
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
340
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
341
+ * <m:mn>0</m:mn>
342
+ * </m:math>
343
+ * </d4p_MathML>
344
+ * \endxmlonly
345
+ * ,
346
+ * \latexonly $\pm \infty$ \endlatexonly
347
+ * \xmlonly
348
+ * <d4p_MathML outputclass="xmlonly">
349
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
350
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
351
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
352
+ * </m:math>
353
+ * </d4p_MathML>
354
+ * \endxmlonly
355
+ * , \p z) returns NaN.
356
+ * - fmaf(\p x, \p y,
357
+ * \latexonly $-\infty$ \endlatexonly
358
+ * \xmlonly
359
+ * <d4p_MathML outputclass="xmlonly">
360
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
361
+ * <m:mo>-</m:mo>
362
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
363
+ * </m:math>
364
+ * </d4p_MathML>
365
+ * \endxmlonly
366
+ * ) returns NaN if
367
+ * \latexonly $x \times y$ \endlatexonly
368
+ * \xmlonly
369
+ * <d4p_MathML outputclass="xmlonly">
370
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
371
+ * <m:mi>x</m:mi>
372
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
373
+ * <m:mi>y</m:mi>
374
+ * </m:math>
375
+ * </d4p_MathML>
376
+ * \endxmlonly
377
+ * is an exact
378
+ * \latexonly $+\infty$ \endlatexonly
379
+ * \xmlonly
380
+ * <d4p_MathML outputclass="xmlonly">
381
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
382
+ * <m:mo>+</m:mo>
383
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
384
+ * </m:math>
385
+ * </d4p_MathML>
386
+ * \endxmlonly
387
+ * .
388
+ * - fmaf(\p x, \p y,
389
+ * \latexonly $+\infty$ \endlatexonly
390
+ * \xmlonly
391
+ * <d4p_MathML outputclass="xmlonly">
392
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
393
+ * <m:mo>+</m:mo>
394
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
395
+ * </m:math>
396
+ * </d4p_MathML>
397
+ * \endxmlonly
398
+ * ) returns NaN if
399
+ * \latexonly $x \times y$ \endlatexonly
400
+ * \xmlonly
401
+ * <d4p_MathML outputclass="xmlonly">
402
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
403
+ * <m:mi>x</m:mi>
404
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
405
+ * <m:mi>y</m:mi>
406
+ * </m:math>
407
+ * </d4p_MathML>
408
+ * \endxmlonly
409
+ * is an exact
410
+ * \latexonly $-\infty$ \endlatexonly
411
+ * \xmlonly
412
+ * <d4p_MathML outputclass="xmlonly">
413
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
414
+ * <m:mo>-</m:mo>
415
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
416
+ * </m:math>
417
+ * </d4p_MathML>
418
+ * \endxmlonly
419
+ * .
420
+ *
421
+ * \note_accuracy_double
422
+ */
423
+ extern __device__ __device_builtin__ double __fma_rz(double x, double y, double z);
424
+ /**
425
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
426
+ * \brief Compute
427
+ * \latexonly $x \times y + z$ \endlatexonly
428
+ * \xmlonly
429
+ * <d4p_MathML outputclass="xmlonly">
430
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
431
+ * <m:mi>x</m:mi>
432
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
433
+ * <m:mi>y</m:mi>
434
+ * <m:mo>+</m:mo>
435
+ * <m:mi>z</m:mi>
436
+ * </m:math>
437
+ * </d4p_MathML>
438
+ * \endxmlonly
439
+ * as a single operation in round-up mode.
440
+ *
441
+ * Computes the value of
442
+ * \latexonly $x \times y + z$ \endlatexonly
443
+ * \xmlonly
444
+ * <d4p_MathML outputclass="xmlonly">
445
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
446
+ * <m:mi>x</m:mi>
447
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
448
+ * <m:mi>y</m:mi>
449
+ * <m:mo>+</m:mo>
450
+ * <m:mi>z</m:mi>
451
+ * </m:math>
452
+ * </d4p_MathML>
453
+ * \endxmlonly
454
+ * as a single ternary operation, rounding the
455
+ * result once in round-up (to positive infinity) mode.
456
+ *
457
+ * \return Returns the rounded value of
458
+ * \latexonly $x \times y + z$ \endlatexonly
459
+ * \xmlonly
460
+ * <d4p_MathML outputclass="xmlonly">
461
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
462
+ * <m:mi>x</m:mi>
463
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
464
+ * <m:mi>y</m:mi>
465
+ * <m:mo>+</m:mo>
466
+ * <m:mi>z</m:mi>
467
+ * </m:math>
468
+ * </d4p_MathML>
469
+ * \endxmlonly
470
+ * as a single operation.
471
+ * - fmaf(
472
+ * \latexonly $\pm \infty$ \endlatexonly
473
+ * \xmlonly
474
+ * <d4p_MathML outputclass="xmlonly">
475
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
476
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
477
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
478
+ * </m:math>
479
+ * </d4p_MathML>
480
+ * \endxmlonly
481
+ * ,
482
+ * \latexonly $\pm 0$ \endlatexonly
483
+ * \xmlonly
484
+ * <d4p_MathML outputclass="xmlonly">
485
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
486
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
487
+ * <m:mn>0</m:mn>
488
+ * </m:math>
489
+ * </d4p_MathML>
490
+ * \endxmlonly
491
+ * , \p z) returns NaN.
492
+ * - fmaf(
493
+ * \latexonly $\pm 0$ \endlatexonly
494
+ * \xmlonly
495
+ * <d4p_MathML outputclass="xmlonly">
496
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
497
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
498
+ * <m:mn>0</m:mn>
499
+ * </m:math>
500
+ * </d4p_MathML>
501
+ * \endxmlonly
502
+ * ,
503
+ * \latexonly $\pm \infty$ \endlatexonly
504
+ * \xmlonly
505
+ * <d4p_MathML outputclass="xmlonly">
506
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
507
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
508
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
509
+ * </m:math>
510
+ * </d4p_MathML>
511
+ * \endxmlonly
512
+ * , \p z) returns NaN.
513
+ * - fmaf(\p x, \p y,
514
+ * \latexonly $-\infty$ \endlatexonly
515
+ * \xmlonly
516
+ * <d4p_MathML outputclass="xmlonly">
517
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
518
+ * <m:mo>-</m:mo>
519
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
520
+ * </m:math>
521
+ * </d4p_MathML>
522
+ * \endxmlonly
523
+ * ) returns NaN if
524
+ * \latexonly $x \times y$ \endlatexonly
525
+ * \xmlonly
526
+ * <d4p_MathML outputclass="xmlonly">
527
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
528
+ * <m:mi>x</m:mi>
529
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
530
+ * <m:mi>y</m:mi>
531
+ * </m:math>
532
+ * </d4p_MathML>
533
+ * \endxmlonly
534
+ * is an exact
535
+ * \latexonly $+\infty$ \endlatexonly
536
+ * \xmlonly
537
+ * <d4p_MathML outputclass="xmlonly">
538
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
539
+ * <m:mo>+</m:mo>
540
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
541
+ * </m:math>
542
+ * </d4p_MathML>
543
+ * \endxmlonly
544
+ * .
545
+ * - fmaf(\p x, \p y,
546
+ * \latexonly $+\infty$ \endlatexonly
547
+ * \xmlonly
548
+ * <d4p_MathML outputclass="xmlonly">
549
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
550
+ * <m:mo>+</m:mo>
551
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
552
+ * </m:math>
553
+ * </d4p_MathML>
554
+ * \endxmlonly
555
+ * ) returns NaN if
556
+ * \latexonly $x \times y$ \endlatexonly
557
+ * \xmlonly
558
+ * <d4p_MathML outputclass="xmlonly">
559
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
560
+ * <m:mi>x</m:mi>
561
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
562
+ * <m:mi>y</m:mi>
563
+ * </m:math>
564
+ * </d4p_MathML>
565
+ * \endxmlonly
566
+ * is an exact
567
+ * \latexonly $-\infty$ \endlatexonly
568
+ * \xmlonly
569
+ * <d4p_MathML outputclass="xmlonly">
570
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
571
+ * <m:mo>-</m:mo>
572
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
573
+ * </m:math>
574
+ * </d4p_MathML>
575
+ * \endxmlonly
576
+ * .
577
+ *
578
+ * \note_accuracy_double
579
+ */
580
+ extern __device__ __device_builtin__ double __fma_ru(double x, double y, double z);
581
+ /**
582
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
583
+ * \brief Compute
584
+ * \latexonly $x \times y + z$ \endlatexonly
585
+ * \xmlonly
586
+ * <d4p_MathML outputclass="xmlonly">
587
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
588
+ * <m:mi>x</m:mi>
589
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
590
+ * <m:mi>y</m:mi>
591
+ * <m:mo>+</m:mo>
592
+ * <m:mi>z</m:mi>
593
+ * </m:math>
594
+ * </d4p_MathML>
595
+ * \endxmlonly
596
+ * as a single operation in round-down mode.
597
+ *
598
+ * Computes the value of
599
+ * \latexonly $x \times y + z$ \endlatexonly
600
+ * \xmlonly
601
+ * <d4p_MathML outputclass="xmlonly">
602
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
603
+ * <m:mi>x</m:mi>
604
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
605
+ * <m:mi>y</m:mi>
606
+ * <m:mo>+</m:mo>
607
+ * <m:mi>z</m:mi>
608
+ * </m:math>
609
+ * </d4p_MathML>
610
+ * \endxmlonly
611
+ * as a single ternary operation, rounding the
612
+ * result once in round-down (to negative infinity) mode.
613
+ *
614
+ * \return Returns the rounded value of
615
+ * \latexonly $x \times y + z$ \endlatexonly
616
+ * \xmlonly
617
+ * <d4p_MathML outputclass="xmlonly">
618
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
619
+ * <m:mi>x</m:mi>
620
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
621
+ * <m:mi>y</m:mi>
622
+ * <m:mo>+</m:mo>
623
+ * <m:mi>z</m:mi>
624
+ * </m:math>
625
+ * </d4p_MathML>
626
+ * \endxmlonly
627
+ * as a single operation.
628
+ * - fmaf(
629
+ * \latexonly $\pm \infty$ \endlatexonly
630
+ * \xmlonly
631
+ * <d4p_MathML outputclass="xmlonly">
632
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
633
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
634
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
635
+ * </m:math>
636
+ * </d4p_MathML>
637
+ * \endxmlonly
638
+ * ,
639
+ * \latexonly $\pm 0$ \endlatexonly
640
+ * \xmlonly
641
+ * <d4p_MathML outputclass="xmlonly">
642
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
643
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
644
+ * <m:mn>0</m:mn>
645
+ * </m:math>
646
+ * </d4p_MathML>
647
+ * \endxmlonly
648
+ * , \p z) returns NaN.
649
+ * - fmaf(
650
+ * \latexonly $\pm 0$ \endlatexonly
651
+ * \xmlonly
652
+ * <d4p_MathML outputclass="xmlonly">
653
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
654
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
655
+ * <m:mn>0</m:mn>
656
+ * </m:math>
657
+ * </d4p_MathML>
658
+ * \endxmlonly
659
+ * ,
660
+ * \latexonly $\pm \infty$ \endlatexonly
661
+ * \xmlonly
662
+ * <d4p_MathML outputclass="xmlonly">
663
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
664
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
665
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
666
+ * </m:math>
667
+ * </d4p_MathML>
668
+ * \endxmlonly
669
+ * , \p z) returns NaN.
670
+ * - fmaf(\p x, \p y,
671
+ * \latexonly $-\infty$ \endlatexonly
672
+ * \xmlonly
673
+ * <d4p_MathML outputclass="xmlonly">
674
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
675
+ * <m:mo>-</m:mo>
676
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
677
+ * </m:math>
678
+ * </d4p_MathML>
679
+ * \endxmlonly
680
+ * ) returns NaN if
681
+ * \latexonly $x \times y$ \endlatexonly
682
+ * \xmlonly
683
+ * <d4p_MathML outputclass="xmlonly">
684
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
685
+ * <m:mi>x</m:mi>
686
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
687
+ * <m:mi>y</m:mi>
688
+ * </m:math>
689
+ * </d4p_MathML>
690
+ * \endxmlonly
691
+ * is an exact
692
+ * \latexonly $+\infty$ \endlatexonly
693
+ * \xmlonly
694
+ * <d4p_MathML outputclass="xmlonly">
695
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
696
+ * <m:mo>+</m:mo>
697
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
698
+ * </m:math>
699
+ * </d4p_MathML>
700
+ * \endxmlonly
701
+ * .
702
+ * - fmaf(\p x, \p y,
703
+ * \latexonly $+\infty$ \endlatexonly
704
+ * \xmlonly
705
+ * <d4p_MathML outputclass="xmlonly">
706
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
707
+ * <m:mo>+</m:mo>
708
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
709
+ * </m:math>
710
+ * </d4p_MathML>
711
+ * \endxmlonly
712
+ * ) returns NaN if
713
+ * \latexonly $x \times y$ \endlatexonly
714
+ * \xmlonly
715
+ * <d4p_MathML outputclass="xmlonly">
716
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
717
+ * <m:mi>x</m:mi>
718
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
719
+ * <m:mi>y</m:mi>
720
+ * </m:math>
721
+ * </d4p_MathML>
722
+ * \endxmlonly
723
+ * is an exact
724
+ * \latexonly $-\infty$ \endlatexonly
725
+ * \xmlonly
726
+ * <d4p_MathML outputclass="xmlonly">
727
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
728
+ * <m:mo>-</m:mo>
729
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
730
+ * </m:math>
731
+ * </d4p_MathML>
732
+ * \endxmlonly
733
+ * .
734
+ *
735
+ * \note_accuracy_double
736
+ */
737
+ extern __device__ __device_builtin__ double __fma_rd(double x, double y, double z);
738
+ /**
739
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
740
+ * \brief Add two floating-point values in round-to-nearest-even mode.
741
+ *
742
+ * Adds two floating-point values \p x and \p y in round-to-nearest-even mode.
743
+ *
744
+ * \return Returns \p x + \p y.
745
+ *
746
+ * \note_accuracy_double
747
+ * \note_nofma
748
+ */
749
+ extern __device__ __device_builtin__ double __dadd_rn(double x, double y);
750
+ /**
751
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
752
+ * \brief Add two floating-point values in round-towards-zero mode.
753
+ *
754
+ * Adds two floating-point values \p x and \p y in round-towards-zero mode.
755
+ *
756
+ * \return Returns \p x + \p y.
757
+ *
758
+ * \note_accuracy_double
759
+ * \note_nofma
760
+ */
761
+ extern __device__ __device_builtin__ double __dadd_rz(double x, double y);
762
+ /**
763
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
764
+ * \brief Add two floating-point values in round-up mode.
765
+ *
766
+ * Adds two floating-point values \p x and \p y in round-up (to positive infinity) mode.
767
+ *
768
+ * \return Returns \p x + \p y.
769
+ *
770
+ * \note_accuracy_double
771
+ * \note_nofma
772
+ */
773
+ extern __device__ __device_builtin__ double __dadd_ru(double x, double y);
774
+ /**
775
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
776
+ * \brief Add two floating-point values in round-down mode.
777
+ *
778
+ * Adds two floating-point values \p x and \p y in round-down (to negative infinity) mode.
779
+ *
780
+ * \return Returns \p x + \p y.
781
+ *
782
+ * \note_accuracy_double
783
+ * \note_nofma
784
+ */
785
+ extern __device__ __device_builtin__ double __dadd_rd(double x, double y);
786
+ /**
787
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
788
+ * \brief Subtract two floating-point values in round-to-nearest-even mode.
789
+ *
790
+ * Subtracts two floating-point values \p x and \p y in round-to-nearest-even mode.
791
+ *
792
+ * \return Returns \p x - \p y.
793
+ *
794
+ * \note_accuracy_double
795
+ * \note_nofma
796
+ */
797
+ extern __device__ __device_builtin__ double __dsub_rn(double x, double y);
798
+ /**
799
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
800
+ * \brief Subtract two floating-point values in round-towards-zero mode.
801
+ *
802
+ * Subtracts two floating-point values \p x and \p y in round-towards-zero mode.
803
+ *
804
+ * \return Returns \p x - \p y.
805
+ *
806
+ * \note_accuracy_double
807
+ * \note_nofma
808
+ */
809
+ extern __device__ __device_builtin__ double __dsub_rz(double x, double y);
810
+ /**
811
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
812
+ * \brief Subtract two floating-point values in round-up mode.
813
+ *
814
+ * Subtracts two floating-point values \p x and \p y in round-up (to positive infinity) mode.
815
+ *
816
+ * \return Returns \p x - \p y.
817
+ *
818
+ * \note_accuracy_double
819
+ * \note_nofma
820
+ */
821
+ extern __device__ __device_builtin__ double __dsub_ru(double x, double y);
822
+ /**
823
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
824
+ * \brief Subtract two floating-point values in round-down mode.
825
+ *
826
+ * Subtracts two floating-point values \p x and \p y in round-down (to negative infinity) mode.
827
+ *
828
+ * \return Returns \p x - \p y.
829
+ *
830
+ * \note_accuracy_double
831
+ * \note_nofma
832
+ */
833
+ extern __device__ __device_builtin__ double __dsub_rd(double x, double y);
834
+ /**
835
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
836
+ * \brief Multiply two floating-point values in round-to-nearest-even mode.
837
+ *
838
+ * Multiplies two floating-point values \p x and \p y in round-to-nearest-even mode.
839
+ *
840
+ * \return Returns \p x * \p y.
841
+ *
842
+ * \note_accuracy_double
843
+ * \note_nofma
844
+ */
845
+ extern __device__ __device_builtin__ double __dmul_rn(double x, double y);
846
+ /**
847
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
848
+ * \brief Multiply two floating-point values in round-towards-zero mode.
849
+ *
850
+ * Multiplies two floating-point values \p x and \p y in round-towards-zero mode.
851
+ *
852
+ * \return Returns \p x * \p y.
853
+ *
854
+ * \note_accuracy_double
855
+ * \note_nofma
856
+ */
857
+ extern __device__ __device_builtin__ double __dmul_rz(double x, double y);
858
+ /**
859
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
860
+ * \brief Multiply two floating-point values in round-up mode.
861
+ *
862
+ * Multiplies two floating-point values \p x and \p y in round-up (to positive infinity) mode.
863
+ *
864
+ * \return Returns \p x * \p y.
865
+ *
866
+ * \note_accuracy_double
867
+ * \note_nofma
868
+ */
869
+ extern __device__ __device_builtin__ double __dmul_ru(double x, double y);
870
+ /**
871
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
872
+ * \brief Multiply two floating-point values in round-down mode.
873
+ *
874
+ * Multiplies two floating-point values \p x and \p y in round-down (to negative infinity) mode.
875
+ *
876
+ * \return Returns \p x * \p y.
877
+ *
878
+ * \note_accuracy_double
879
+ * \note_nofma
880
+ */
881
+ extern __device__ __device_builtin__ double __dmul_rd(double x, double y);
882
+ /**
883
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
884
+ * \brief Convert a double to a float in round-to-nearest-even mode.
885
+ *
886
+ * Convert the double-precision floating-point value \p x to a single-precision
887
+ * floating-point value in round-to-nearest-even mode.
888
+ * \return Returns converted value.
889
+ */
890
+ extern __device__ __device_builtin__ float __double2float_rn(double x);
891
+ /**
892
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
893
+ * \brief Convert a double to a float in round-towards-zero mode.
894
+ *
895
+ * Convert the double-precision floating-point value \p x to a single-precision
896
+ * floating-point value in round-towards-zero mode.
897
+ * \return Returns converted value.
898
+ */
899
+ extern __device__ __device_builtin__ float __double2float_rz(double x);
900
+ /**
901
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
902
+ * \brief Convert a double to a float in round-up mode.
903
+ *
904
+ * Convert the double-precision floating-point value \p x to a single-precision
905
+ * floating-point value in round-up (to positive infinity) mode.
906
+ * \return Returns converted value.
907
+ */
908
+ extern __device__ __device_builtin__ float __double2float_ru(double x);
909
+ /**
910
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
911
+ * \brief Convert a double to a float in round-down mode.
912
+ *
913
+ * Convert the double-precision floating-point value \p x to a single-precision
914
+ * floating-point value in round-down (to negative infinity) mode.
915
+ * \return Returns converted value.
916
+ */
917
+ extern __device__ __device_builtin__ float __double2float_rd(double x);
918
+ /**
919
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
920
+ * \brief Convert a double to a signed int in round-to-nearest-even mode.
921
+ *
922
+ * Convert the double-precision floating-point value \p x to a
923
+ * signed integer value in round-to-nearest-even mode.
924
+ * \return Returns converted value.
925
+ */
926
+ extern __device__ __device_builtin__ int __double2int_rn(double x);
927
+ /**
928
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
929
+ * \brief Convert a double to a signed int in round-up mode.
930
+ *
931
+ * Convert the double-precision floating-point value \p x to a
932
+ * signed integer value in round-up (to positive infinity) mode.
933
+ * \return Returns converted value.
934
+ */
935
+ extern __device__ __device_builtin__ int __double2int_ru(double x);
936
+ /**
937
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
938
+ * \brief Convert a double to a signed int in round-down mode.
939
+ *
940
+ * Convert the double-precision floating-point value \p x to a
941
+ * signed integer value in round-down (to negative infinity) mode.
942
+ * \return Returns converted value.
943
+ */
944
+ extern __device__ __device_builtin__ int __double2int_rd(double x);
945
+ /**
946
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
947
+ * \brief Convert a double to an unsigned int in round-to-nearest-even mode.
948
+ *
949
+ * Convert the double-precision floating-point value \p x to an
950
+ * unsigned integer value in round-to-nearest-even mode.
951
+ * \return Returns converted value.
952
+ */
953
+ extern __device__ __device_builtin__ unsigned int __double2uint_rn(double x);
954
+ /**
955
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
956
+ * \brief Convert a double to an unsigned int in round-up mode.
957
+ *
958
+ * Convert the double-precision floating-point value \p x to an
959
+ * unsigned integer value in round-up (to positive infinity) mode.
960
+ * \return Returns converted value.
961
+ */
962
+ extern __device__ __device_builtin__ unsigned int __double2uint_ru(double x);
963
+ /**
964
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
965
+ * \brief Convert a double to an unsigned int in round-down mode.
966
+ *
967
+ * Convert the double-precision floating-point value \p x to an
968
+ * unsigned integer value in round-down (to negative infinity) mode.
969
+ * \return Returns converted value.
970
+ */
971
+ extern __device__ __device_builtin__ unsigned int __double2uint_rd(double x);
972
+ /**
973
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
974
+ * \brief Convert a double to a signed 64-bit int in round-to-nearest-even mode.
975
+ *
976
+ * Convert the double-precision floating-point value \p x to a
977
+ * signed 64-bit integer value in round-to-nearest-even mode.
978
+ * \return Returns converted value.
979
+ */
980
+ extern __device__ __device_builtin__ long long int __double2ll_rn(double x);
981
+ /**
982
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
983
+ * \brief Convert a double to a signed 64-bit int in round-up mode.
984
+ *
985
+ * Convert the double-precision floating-point value \p x to a
986
+ * signed 64-bit integer value in round-up (to positive infinity) mode.
987
+ * \return Returns converted value.
988
+ */
989
+ extern __device__ __device_builtin__ long long int __double2ll_ru(double x);
990
+ /**
991
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
992
+ * \brief Convert a double to a signed 64-bit int in round-down mode.
993
+ *
994
+ * Convert the double-precision floating-point value \p x to a
995
+ * signed 64-bit integer value in round-down (to negative infinity) mode.
996
+ * \return Returns converted value.
997
+ */
998
+ extern __device__ __device_builtin__ long long int __double2ll_rd(double x);
999
+ /**
1000
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1001
+ * \brief Convert a double to an unsigned 64-bit int in round-to-nearest-even mode.
1002
+ *
1003
+ * Convert the double-precision floating-point value \p x to an
1004
+ * unsigned 64-bit integer value in round-to-nearest-even mode.
1005
+ * \return Returns converted value.
1006
+ */
1007
+ extern __device__ __device_builtin__ unsigned long long int __double2ull_rn(double x);
1008
+ /**
1009
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1010
+ * \brief Convert a double to an unsigned 64-bit int in round-up mode.
1011
+ *
1012
+ * Convert the double-precision floating-point value \p x to an
1013
+ * unsigned 64-bit integer value in round-up (to positive infinity) mode.
1014
+ * \return Returns converted value.
1015
+ */
1016
+ extern __device__ __device_builtin__ unsigned long long int __double2ull_ru(double x);
1017
+ /**
1018
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1019
+ * \brief Convert a double to an unsigned 64-bit int in round-down mode.
1020
+ *
1021
+ * Convert the double-precision floating-point value \p x to an
1022
+ * unsigned 64-bit integer value in round-down (to negative infinity) mode.
1023
+ * \return Returns converted value.
1024
+ */
1025
+ extern __device__ __device_builtin__ unsigned long long int __double2ull_rd(double x);
1026
+ /**
1027
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1028
+ * \brief Convert a signed int to a double.
1029
+ *
1030
+ * Convert the signed integer value \p x to a double-precision floating-point value.
1031
+ * \return Returns converted value.
1032
+ */
1033
+ extern __device__ __device_builtin__ double __int2double_rn(int x);
1034
+ /**
1035
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1036
+ * \brief Convert an unsigned int to a double.
1037
+ *
1038
+ * Convert the unsigned integer value \p x to a double-precision floating-point value.
1039
+ * \return Returns converted value.
1040
+ */
1041
+ extern __device__ __device_builtin__ double __uint2double_rn(unsigned int x);
1042
+ /**
1043
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1044
+ * \brief Convert a signed 64-bit int to a double in round-to-nearest-even mode.
1045
+ *
1046
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
1047
+ * value in round-to-nearest-even mode.
1048
+ * \return Returns converted value.
1049
+ */
1050
+ extern __device__ __device_builtin__ double __ll2double_rn(long long int x);
1051
+ /**
1052
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1053
+ * \brief Convert a signed 64-bit int to a double in round-towards-zero mode.
1054
+ *
1055
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
1056
+ * value in round-towards-zero mode.
1057
+ * \return Returns converted value.
1058
+ */
1059
+ extern __device__ __device_builtin__ double __ll2double_rz(long long int x);
1060
+ /**
1061
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1062
+ * \brief Convert a signed 64-bit int to a double in round-up mode.
1063
+ *
1064
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
1065
+ * value in round-up (to positive infinity) mode.
1066
+ * \return Returns converted value.
1067
+ */
1068
+ extern __device__ __device_builtin__ double __ll2double_ru(long long int x);
1069
+ /**
1070
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1071
+ * \brief Convert a signed 64-bit int to a double in round-down mode.
1072
+ *
1073
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
1074
+ * value in round-down (to negative infinity) mode.
1075
+ * \return Returns converted value.
1076
+ */
1077
+ extern __device__ __device_builtin__ double __ll2double_rd(long long int x);
1078
+ /**
1079
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1080
+ * \brief Convert an unsigned 64-bit int to a double in round-to-nearest-even mode.
1081
+ *
1082
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
1083
+ * value in round-to-nearest-even mode.
1084
+ * \return Returns converted value.
1085
+ */
1086
+ extern __device__ __device_builtin__ double __ull2double_rn(unsigned long long int x);
1087
+ /**
1088
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1089
+ * \brief Convert an unsigned 64-bit int to a double in round-towards-zero mode.
1090
+ *
1091
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
1092
+ * value in round-towards-zero mode.
1093
+ * \return Returns converted value.
1094
+ */
1095
+ extern __device__ __device_builtin__ double __ull2double_rz(unsigned long long int x);
1096
+ /**
1097
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1098
+ * \brief Convert an unsigned 64-bit int to a double in round-up mode.
1099
+ *
1100
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
1101
+ * value in round-up (to positive infinity) mode.
1102
+ * \return Returns converted value.
1103
+ */
1104
+ extern __device__ __device_builtin__ double __ull2double_ru(unsigned long long int x);
1105
+ /**
1106
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1107
+ * \brief Convert an unsigned 64-bit int to a double in round-down mode.
1108
+ *
1109
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
1110
+ * value in round-down (to negative infinity) mode.
1111
+ * \return Returns converted value.
1112
+ */
1113
+ extern __device__ __device_builtin__ double __ull2double_rd(unsigned long long int x);
1114
+ /**
1115
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1116
+ * \brief Reinterpret high 32 bits in a double as a signed integer.
1117
+ *
1118
+ * Reinterpret the high 32 bits in the double-precision floating-point value \p x
1119
+ * as a signed integer.
1120
+ * \return Returns reinterpreted value.
1121
+ */
1122
+ extern __device__ __device_builtin__ int __double2hiint(double x);
1123
+ /**
1124
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1125
+ * \brief Reinterpret low 32 bits in a double as a signed integer.
1126
+ *
1127
+ * Reinterpret the low 32 bits in the double-precision floating-point value \p x
1128
+ * as a signed integer.
1129
+ * \return Returns reinterpreted value.
1130
+ */
1131
+ extern __device__ __device_builtin__ int __double2loint(double x);
1132
+ /**
1133
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1134
+ * \brief Reinterpret high and low 32-bit integer values as a double.
1135
+ *
1136
+ * Reinterpret the integer value of \p hi as the high 32 bits of a
1137
+ * double-precision floating-point value and the integer value of \p lo
1138
+ * as the low 32 bits of the same double-precision floating-point value.
1139
+ * \return Returns reinterpreted value.
1140
+ */
1141
+ extern __device__ __device_builtin__ double __hiloint2double(int hi, int lo);
1142
+ }
1143
+
1144
+ /*******************************************************************************
1145
+ * *
1146
+ * *
1147
+ * *
1148
+ *******************************************************************************/
1149
+
1150
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double fma(double a, double b, double c, enum cudaRoundMode mode);
1151
+
1152
+ #undef EXCLUDE_FROM_RTC
1153
+
1154
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double dmul(double a, double b, enum cudaRoundMode mode = cudaRoundNearest);
1155
+
1156
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double dadd(double a, double b, enum cudaRoundMode mode = cudaRoundNearest);
1157
+
1158
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double dsub(double a, double b, enum cudaRoundMode mode = cudaRoundNearest);
1159
+
1160
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ int double2int(double a, enum cudaRoundMode mode = cudaRoundZero);
1161
+
1162
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned int double2uint(double a, enum cudaRoundMode mode = cudaRoundZero);
1163
+
1164
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ long long int double2ll(double a, enum cudaRoundMode mode = cudaRoundZero);
1165
+
1166
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned long long int double2ull(double a, enum cudaRoundMode mode = cudaRoundZero);
1167
+
1168
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double ll2double(long long int a, enum cudaRoundMode mode = cudaRoundNearest);
1169
+
1170
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double ull2double(unsigned long long int a, enum cudaRoundMode mode = cudaRoundNearest);
1171
+
1172
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double int2double(int a, enum cudaRoundMode mode = cudaRoundNearest);
1173
+
1174
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double uint2double(unsigned int a, enum cudaRoundMode mode = cudaRoundNearest);
1175
+
1176
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double float2double(float a, enum cudaRoundMode mode = cudaRoundNearest);
1177
+
1178
+ #undef __DEVICE_DOUBLE_FUNCTIONS_DECL__
1179
+
1180
+
1181
+ #endif /* __cplusplus && __CUDACC__ */
1182
+
1183
+ #if !defined(__CUDACC_RTC__)
1184
+ #include "device_double_functions.hpp"
1185
+ #endif /* !__CUDACC_RTC__ */
1186
+
1187
+ #endif /* !__DEVICE_DOUBLE_FUNCTIONS_H__ */
1188
+
1189
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H__)
1190
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
1191
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H__
1192
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_double_functions.hpp ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("crt/device_double_functions.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "crt/device_double_functions.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__
58
+ #endif
59
+
60
+ #if !defined(__DEVICE_DOUBLE_FUNCTIONS_HPP__)
61
+ #define __DEVICE_DOUBLE_FUNCTIONS_HPP__
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #if defined(__cplusplus) && defined(__CUDACC__)
70
+
71
+ /*******************************************************************************
72
+ * *
73
+ * *
74
+ * *
75
+ *******************************************************************************/
76
+
77
+ #if defined(__CUDACC_RTC__)
78
+ #define __DEVICE_DOUBLE_FUNCTIONS_DECL__ __device__
79
+ #else
80
+ #define __DEVICE_DOUBLE_FUNCTIONS_DECL__ static __inline__ __device__
81
+ #endif /* __CUDACC_RTC__ */
82
+
83
+ #include "builtin_types.h"
84
+ #include "device_types.h"
85
+ #include "host_defines.h"
86
+
87
+ /*******************************************************************************
88
+ * *
89
+ * *
90
+ * *
91
+ *******************************************************************************/
92
+
93
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double fma(double a, double b, double c, enum cudaRoundMode mode)
94
+ {
95
+ return mode == cudaRoundZero ? __fma_rz(a, b, c) :
96
+ mode == cudaRoundPosInf ? __fma_ru(a, b, c) :
97
+ mode == cudaRoundMinInf ? __fma_rd(a, b, c) :
98
+ __fma_rn(a, b, c);
99
+ }
100
+
101
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double dmul(double a, double b, enum cudaRoundMode mode)
102
+ {
103
+ return mode == cudaRoundZero ? __dmul_rz(a, b) :
104
+ mode == cudaRoundPosInf ? __dmul_ru(a, b) :
105
+ mode == cudaRoundMinInf ? __dmul_rd(a, b) :
106
+ __dmul_rn(a, b);
107
+ }
108
+
109
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double dadd(double a, double b, enum cudaRoundMode mode)
110
+ {
111
+ return mode == cudaRoundZero ? __dadd_rz(a, b) :
112
+ mode == cudaRoundPosInf ? __dadd_ru(a, b) :
113
+ mode == cudaRoundMinInf ? __dadd_rd(a, b) :
114
+ __dadd_rn(a, b);
115
+ }
116
+
117
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double dsub(double a, double b, enum cudaRoundMode mode)
118
+ {
119
+ return mode == cudaRoundZero ? __dsub_rz(a, b) :
120
+ mode == cudaRoundPosInf ? __dsub_ru(a, b) :
121
+ mode == cudaRoundMinInf ? __dsub_rd(a, b) :
122
+ __dsub_rn(a, b);
123
+ }
124
+
125
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ int double2int(double a, enum cudaRoundMode mode)
126
+ {
127
+ return mode == cudaRoundNearest ? __double2int_rn(a) :
128
+ mode == cudaRoundPosInf ? __double2int_ru(a) :
129
+ mode == cudaRoundMinInf ? __double2int_rd(a) :
130
+ __double2int_rz(a);
131
+ }
132
+
133
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned int double2uint(double a, enum cudaRoundMode mode)
134
+ {
135
+ return mode == cudaRoundNearest ? __double2uint_rn(a) :
136
+ mode == cudaRoundPosInf ? __double2uint_ru(a) :
137
+ mode == cudaRoundMinInf ? __double2uint_rd(a) :
138
+ __double2uint_rz(a);
139
+ }
140
+
141
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ long long int double2ll(double a, enum cudaRoundMode mode)
142
+ {
143
+ return mode == cudaRoundNearest ? __double2ll_rn(a) :
144
+ mode == cudaRoundPosInf ? __double2ll_ru(a) :
145
+ mode == cudaRoundMinInf ? __double2ll_rd(a) :
146
+ __double2ll_rz(a);
147
+ }
148
+
149
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned long long int double2ull(double a, enum cudaRoundMode mode)
150
+ {
151
+ return mode == cudaRoundNearest ? __double2ull_rn(a) :
152
+ mode == cudaRoundPosInf ? __double2ull_ru(a) :
153
+ mode == cudaRoundMinInf ? __double2ull_rd(a) :
154
+ __double2ull_rz(a);
155
+ }
156
+
157
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double ll2double(long long int a, enum cudaRoundMode mode)
158
+ {
159
+ return mode == cudaRoundZero ? __ll2double_rz(a) :
160
+ mode == cudaRoundPosInf ? __ll2double_ru(a) :
161
+ mode == cudaRoundMinInf ? __ll2double_rd(a) :
162
+ __ll2double_rn(a);
163
+ }
164
+
165
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double ull2double(unsigned long long int a, enum cudaRoundMode mode)
166
+ {
167
+ return mode == cudaRoundZero ? __ull2double_rz(a) :
168
+ mode == cudaRoundPosInf ? __ull2double_ru(a) :
169
+ mode == cudaRoundMinInf ? __ull2double_rd(a) :
170
+ __ull2double_rn(a);
171
+ }
172
+
173
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double int2double(int a, enum cudaRoundMode mode)
174
+ {
175
+ return (double)a;
176
+ }
177
+
178
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double uint2double(unsigned int a, enum cudaRoundMode mode)
179
+ {
180
+ return (double)a;
181
+ }
182
+
183
+ __DEVICE_DOUBLE_FUNCTIONS_DECL__ double float2double(float a, enum cudaRoundMode mode)
184
+ {
185
+ return (double)a;
186
+ }
187
+
188
+ #undef __DEVICE_DOUBLE_FUNCTIONS_DECL__
189
+
190
+ #endif /* __cplusplus && __CUDACC__ */
191
+
192
+ #endif /* !__DEVICE_DOUBLE_FUNCTIONS_HPP__ */
193
+
194
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__)
195
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
196
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__
197
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_functions.h ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/device_functions.hpp ADDED
@@ -0,0 +1,1197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("crt/device_functions.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "crt/device_functions.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__
58
+ #endif
59
+
60
+ #if !defined(__DEVICE_FUNCTIONS_HPP__)
61
+ #define __DEVICE_FUNCTIONS_HPP__
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #if defined(__cplusplus) && defined(__CUDACC__)
70
+
71
+ #if defined(__CUDACC_RTC__)
72
+ #define __DEVICE_FUNCTIONS_DECL__ __device__
73
+ #define __DEVICE_FUNCTIONS_STATIC_DECL__ __device__
74
+ #define __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ __device__ __host__ __cudart_builtin__
75
+ #else
76
+ #define __DEVICE_FUNCTIONS_DECL__ __device__
77
+ #define __DEVICE_FUNCTIONS_STATIC_DECL__ static __inline__ __device__
78
+ #define __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ static __inline__ __device__ __host__ __cudart_builtin__
79
+ #endif /* __CUDACC_RTC__ */
80
+
81
+ #include "builtin_types.h"
82
+ #include "device_types.h"
83
+ #include "host_defines.h"
84
+
85
+ #undef __DEVICE_FUNCTIONS_DECL__
86
+ #undef __DEVICE_FUNCTIONS_STATIC_DECL__
87
+
88
+ #endif /* __cplusplus && __CUDACC__ */
89
+
90
+ /*******************************************************************************
91
+ * *
92
+ * *
93
+ * *
94
+ *******************************************************************************/
95
+
96
+ #ifdef __CUDACC__
97
+ # if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
98
+ #define __CUDA_AND_AT_LEAST_SM_90__
99
+ #endif /* defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) */
100
+ # if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
101
+ #define __CUDA_AND_AT_LEAST_SM_70__
102
+ #endif /* defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) */
103
+ # if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
104
+ #define __CUDA_AND_AT_LEAST_SM_75__
105
+ #endif /* defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) */
106
+ #endif /* __CUDACC__ */
107
+
108
// Signed 32-bit max with relu: returns max(a, b) clamped below at zero.
// Single fused PTX instruction on SM90+; plain max + clamp elsewhere.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax_s32_relu(const int a, const int b){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm("{max.s32.relu %0, %1, %2;}" : "=r"(res) : "r"(a), "r"(b));
    return res;
#else
    // Host and pre-SM90 fallback.
    const int m = max(a, b);
    if (m > 0) {
        return m;
    }
    return 0;
#endif
}
120
+
121
// Per-halfword signed max of two packed s16x2 values, each half clamped
// below at zero (relu). One PTX instruction on SM90+, SIMD intrinsics on
// older devices, and a scalar per-half computation on the host.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax_s16x2_relu(const unsigned int a, const unsigned int b){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm("{max.s16x2.relu %0, %1, %2;}" : "=r"(res) : "r"(a), "r"(b));
#elif defined(__CUDA_ARCH__)
    // Pre-SM90 device path: SIMD signed max, then clamp negatives via max with 0.
    res = __vmaxs2(__vmaxs2(a, b), 0U);
#else
    // Host fallback: process low and high 16-bit halves independently.
    short a_lo = (short)(unsigned short)(a & 0xFFFFU);
    short a_hi = (short)(unsigned short)(a >> 16);
    short b_lo = (short)(unsigned short)(b & 0xFFFFU);
    short b_hi = (short)(unsigned short)(b >> 16);

    short r_lo = (short)max(a_lo, b_lo);
    short r_hi = (short)max(a_hi, b_hi);

    // relu: negative halves become zero.
    if (r_lo < 0) { r_lo = 0; }
    if (r_hi < 0) { r_hi = 0; }

    // Repack the two halves into one 32-bit word.
    res = ((unsigned int)(unsigned short)r_lo) |
          (((unsigned int)(unsigned short)r_hi) << 16);
#endif

    return res;
}
161
+
162
// Signed 32-bit min with relu: returns min(a, b) clamped below at zero.
// Single fused PTX instruction on SM90+; plain min + clamp elsewhere.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimin_s32_relu(const int a, const int b){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm("{min.s32.relu %0, %1, %2;}" : "=r"(res) : "r"(a), "r"(b));
    return res;
#else
    // Host and pre-SM90 fallback.
    const int m = min(a, b);
    if (m > 0) {
        return m;
    }
    return 0;
#endif
}
174
+
175
// Per-halfword signed min of two packed s16x2 values, each half clamped
// below at zero (relu). One PTX instruction on SM90+, SIMD intrinsics on
// older devices, and a scalar per-half computation on the host.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin_s16x2_relu(const unsigned int a, const unsigned int b){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm("{min.s16x2.relu %0, %1, %2;}" : "=r"(res) : "r"(a), "r"(b));
#elif defined(__CUDA_ARCH__)
    // Pre-SM90 device path: SIMD signed min, then clamp negatives via max with 0.
    res = __vmaxs2(__vmins2(a, b), 0U);
#else
    // Host fallback: process low and high 16-bit halves independently.
    short a_lo = (short)(unsigned short)(a & 0xFFFFU);
    short a_hi = (short)(unsigned short)(a >> 16);
    short b_lo = (short)(unsigned short)(b & 0xFFFFU);
    short b_hi = (short)(unsigned short)(b >> 16);

    short r_lo = (short)min(a_lo, b_lo);
    short r_hi = (short)min(a_hi, b_hi);

    // relu: negative halves become zero.
    if (r_lo < 0) { r_lo = 0; }
    if (r_hi < 0) { r_hi = 0; }

    // Repack the two halves into one 32-bit word.
    res = ((unsigned int)(unsigned short)r_lo) |
          (((unsigned int)(unsigned short)r_hi) << 16);
#endif

    return res;
}
215
+
216
// Three-way signed 32-bit max: max(a, b, c).
// Fused two-instruction PTX sequence on SM90+; nested max() elsewhere.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax3_s32(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm ("{.reg .s32 t1; \n\t"
         "max.s32 t1, %1, %2; \n\t"
         "max.s32 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and pre-SM90 fallback.
    const int ab = max(a, b);
    return max(ab, c);
#endif
}
229
+
230
// Per-halfword signed max of three packed s16x2 operands.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    // Future asm code (naming/syntax may change):
    asm ("{.reg .b32 t1; \n\t"
         "max.s16x2 t1, %1, %2; \n\t"
         "max.s16x2 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_AND_AT_LEAST_SM_70__)
    // SM70..SM89 device path: chained SIMD signed max.
    res = __vmaxs2(__vmaxs2(a, b), c);
#else
    // Host / pre-SM70 fallback: process each 16-bit half separately.
    short a_lo = (short)(unsigned short)(a & 0xFFFFU);
    short a_hi = (short)(unsigned short)(a >> 16);
    short b_lo = (short)(unsigned short)(b & 0xFFFFU);
    short b_hi = (short)(unsigned short)(b >> 16);
    short c_lo = (short)(unsigned short)(c & 0xFFFFU);
    short c_hi = (short)(unsigned short)(c >> 16);

    const short r_lo = (short)max(max(a_lo, b_lo), c_lo);
    const short r_hi = (short)max(max(a_hi, b_hi), c_hi);

    // Repack the two halves into one 32-bit word.
    res = ((unsigned int)(unsigned short)r_lo) |
          (((unsigned int)(unsigned short)r_hi) << 16);
#endif
    return res;
}
275
+
276
// Three-way unsigned 32-bit max: max(a, b, c).
// Fused two-instruction PTX sequence on SM90+; nested max() elsewhere.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_u32(const unsigned int a, const unsigned int b, const unsigned int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    // Fix: the result was previously declared 'int' even though the function
    // returns unsigned int and the PTX sequence operates on .u32 values
    // (sibling __viaddmax_u32 already uses 'unsigned int' here).
    unsigned int res;
    asm ("{.reg .u32 t1; \n\t"
         "max.u32 t1, %1, %2; \n\t"
         "max.u32 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and older architecture code
    return max(max(a, b), c);
#endif
}
289
+
290
// Per-halfword unsigned max of three packed u16x2 operands.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "max.u16x2 t1, %1, %2; \n\t"
         "max.u16x2 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
    // Pre-SM90 device path: chained SIMD unsigned max.
    res = __vmaxu2(__vmaxu2(a, b), c);
#else
    // Host fallback: process each 16-bit half separately.
    unsigned short a_lo = (unsigned short)(a & 0xFFFFU);
    unsigned short a_hi = (unsigned short)(a >> 16);
    unsigned short b_lo = (unsigned short)(b & 0xFFFFU);
    unsigned short b_hi = (unsigned short)(b >> 16);
    unsigned short c_lo = (unsigned short)(c & 0xFFFFU);
    unsigned short c_hi = (unsigned short)(c >> 16);

    const unsigned short r_lo = (unsigned short)max(max(a_lo, b_lo), c_lo);
    const unsigned short r_hi = (unsigned short)max(max(a_hi, b_hi), c_hi);

    // Repack the two halves into one 32-bit word.
    res = ((unsigned int)r_lo) | (((unsigned int)r_hi) << 16);
#endif

    return res;
}
321
+
322
// Three-way signed 32-bit min: min(a, b, c).
// Fused two-instruction PTX sequence on SM90+; nested min() elsewhere.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimin3_s32(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm ("{.reg .s32 t1; \n\t"
         "min.s32 t1, %1, %2; \n\t"
         "min.s32 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and pre-SM90 fallback.
    const int ab = min(a, b);
    return min(ab, c);
#endif
}
335
+
336
// Per-halfword signed min of three packed s16x2 operands.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "min.s16x2 t1, %1, %2; \n\t"
         "min.s16x2 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_AND_AT_LEAST_SM_70__)
    // SM70..SM89 device path: chained SIMD signed min.
    res = __vmins2(__vmins2(a, b), c);
#else
    // Host / pre-SM70 fallback: process each 16-bit half separately.
    short a_lo = (short)(unsigned short)(a & 0xFFFFU);
    short a_hi = (short)(unsigned short)(a >> 16);
    short b_lo = (short)(unsigned short)(b & 0xFFFFU);
    short b_hi = (short)(unsigned short)(b >> 16);
    short c_lo = (short)(unsigned short)(c & 0xFFFFU);
    short c_hi = (short)(unsigned short)(c >> 16);

    const short r_lo = (short)min(min(a_lo, b_lo), c_lo);
    const short r_hi = (short)min(min(a_hi, b_hi), c_hi);

    // Repack the two halves into one 32-bit word.
    res = ((unsigned int)(unsigned short)r_lo) |
          (((unsigned int)(unsigned short)r_hi) << 16);
#endif

    return res;
}
381
+
382
// Three-way unsigned 32-bit min: min(a, b, c).
// Fused two-instruction PTX sequence on SM90+; nested min() elsewhere.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_u32(const unsigned int a, const unsigned int b, const unsigned int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    // Fix: the result was previously declared 'int' even though the function
    // returns unsigned int and the PTX sequence operates on .u32 values
    // (sibling __viaddmin_u32 already uses 'unsigned int' here).
    unsigned int res;
    asm ("{.reg .u32 t1; \n\t"
         "min.u32 t1, %1, %2; \n\t"
         "min.u32 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and older architecture code
    return min(min(a, b), c);
#endif
}
395
+
396
// Per-halfword unsigned min of three packed u16x2 operands.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "min.u16x2 t1, %1, %2; \n\t"
         "min.u16x2 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
    // Pre-SM90 device path: chained SIMD unsigned min.
    res = __vminu2(__vminu2(a, b), c);
#else
    // Host fallback: process each 16-bit half separately.
    unsigned short a_lo = (unsigned short)(a & 0xFFFFU);
    unsigned short a_hi = (unsigned short)(a >> 16);
    unsigned short b_lo = (unsigned short)(b & 0xFFFFU);
    unsigned short b_hi = (unsigned short)(b >> 16);
    unsigned short c_lo = (unsigned short)(c & 0xFFFFU);
    unsigned short c_hi = (unsigned short)(c >> 16);

    const unsigned short r_lo = (unsigned short)min(min(a_lo, b_lo), c_lo);
    const unsigned short r_hi = (unsigned short)min(min(a_hi, b_hi), c_hi);

    // Repack the two halves into one 32-bit word.
    res = ((unsigned int)r_lo) | (((unsigned int)r_hi) << 16);
#endif

    return res;
}
427
+
428
// Three-way signed 32-bit max with relu: max(a, b, c) clamped below at zero.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax3_s32_relu(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm ("{.reg .s32 t1; \n\t"
         "max.s32.relu t1, %1, %2; \n\t"
         "max.s32.relu %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and pre-SM90 fallback.
    const int m = max(max(a, b), c);
    if (m > 0) {
        return m;
    }
    return 0;
#endif
}
443
+
444
// Per-halfword signed max of three packed s16x2 operands, each half clamped
// below at zero (relu).
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "max.s16x2.relu t1, %1, %2; \n\t"
         "max.s16x2.relu %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_AND_AT_LEAST_SM_75__)
    // SM75..SM89 device path: SIMD max of a,b then two-operand max-relu with c.
    res = __vimax_s16x2_relu(__vmaxs2(a, b), c);
#else
    // Host / pre-SM75 fallback: process each 16-bit half separately.
    short a_lo = (short)(unsigned short)(a & 0xFFFFU);
    short a_hi = (short)(unsigned short)(a >> 16);
    short b_lo = (short)(unsigned short)(b & 0xFFFFU);
    short b_hi = (short)(unsigned short)(b >> 16);
    short c_lo = (short)(unsigned short)(c & 0xFFFFU);
    short c_hi = (short)(unsigned short)(c >> 16);

    short r_lo = (short)max(max(a_lo, b_lo), c_lo);
    short r_hi = (short)max(max(a_hi, b_hi), c_hi);

    // relu: negative halves become zero.
    if (r_lo < 0) { r_lo = 0; }
    if (r_hi < 0) { r_hi = 0; }

    // Repack the two halves into one 32-bit word.
    res = ((unsigned int)(unsigned short)r_lo) |
          (((unsigned int)(unsigned short)r_hi) << 16);
#endif

    return res;
}
493
+
494
// Three-way signed 32-bit min with relu: min(a, b, c) clamped below at zero.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimin3_s32_relu(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm ("{.reg .s32 t1; \n\t"
         "min.s32.relu t1, %1, %2; \n\t"
         "min.s32.relu %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and pre-SM90 fallback.
    const int m = min(min(a, b), c);
    if (m > 0) {
        return m;
    }
    return 0;
#endif
}
509
+
510
// Per-halfword signed min of three packed s16x2 operands, each half clamped
// below at zero (relu).
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
    // Consistency fix: was declared 'unsigned res;' — every sibling intrinsic
    // in this family spells the type 'unsigned int'.
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "min.s16x2.relu t1, %1, %2; \n\t"
         "min.s16x2.relu %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_AND_AT_LEAST_SM_75__)
    // SM75..SM89 device path: SIMD min of a,b then two-operand min-relu with c.
    res = __vimin_s16x2_relu(__vmins2(a, b), c);
#else
    // Host / pre-SM75 fallback: process each 16-bit half separately.
    short a_lo = (short)(unsigned short)(a & 0xFFFFU);
    short a_hi = (short)(unsigned short)(a >> 16);
    short b_lo = (short)(unsigned short)(b & 0xFFFFU);
    short b_hi = (short)(unsigned short)(b >> 16);
    short c_lo = (short)(unsigned short)(c & 0xFFFFU);
    short c_hi = (short)(unsigned short)(c >> 16);

    short r_lo = (short)min(min(a_lo, b_lo), c_lo);
    short r_hi = (short)min(min(a_hi, b_hi), c_hi);

    // relu: negative halves become zero.
    if (r_lo < 0) { r_lo = 0; }
    if (r_hi < 0) { r_hi = 0; }

    // Repack the two halves into one 32-bit word.
    res = ((unsigned int)(unsigned short)r_lo) |
          (((unsigned int)(unsigned short)r_hi) << 16);
#endif

    return res;
}
559
+
560
// Fused add-then-max for signed 32-bit values: max(a + b, c).
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmax_s32(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm ("{.reg .s32 t1; \n\t"
         "add.s32 t1, %1, %2; \n\t"
         "max.s32 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and pre-SM90 fallback.
    const int sum = a + b;
    return max(sum, c);
#endif
}
573
+
574
// Per-halfword fused add-then-max for packed s16x2 values: max(a + b, c)
// computed independently on each 16-bit half.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "add.s16x2 t1, %1, %2; \n\t"
         "max.s16x2 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
    // Pre-SM90 device path: SIMD halfword add, then SIMD signed max.
    res = __vmaxs2(__vadd2(a, b), c);
#else
    // Host fallback: process each 16-bit half separately.
    short a_lo = (short)(unsigned short)(a & 0xFFFFU);
    short a_hi = (short)(unsigned short)(a >> 16);
    short b_lo = (short)(unsigned short)(b & 0xFFFFU);
    short b_hi = (short)(unsigned short)(b >> 16);
    short c_lo = (short)(unsigned short)(c & 0xFFFFU);
    short c_hi = (short)(unsigned short)(c >> 16);

    // Sum truncated back to 16 bits before the max, matching the SIMD add.
    const short s_lo = (short)(a_lo + b_lo);
    const short s_hi = (short)(a_hi + b_hi);
    const short r_lo = (short)max(s_lo, c_lo);
    const short r_hi = (short)max(s_hi, c_hi);

    // Repack the two halves into one 32-bit word.
    res = ((unsigned int)(unsigned short)r_lo) |
          (((unsigned int)(unsigned short)r_hi) << 16);
#endif

    return res;
}
619
+
620
// Fused add-then-max for unsigned 32-bit values: max(a + b, c).
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_u32(const unsigned int a, const unsigned int b, const unsigned int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    unsigned int res;
    asm ("{.reg .u32 t1; \n\t"
         "add.u32 t1, %1, %2; \n\t"
         "max.u32 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and pre-SM90 fallback.
    const unsigned int sum = a + b;
    return max(sum, c);
#endif
}
633
+
634
// Per-halfword fused add-then-max for packed u16x2 values: max(a + b, c)
// computed independently on each 16-bit half.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "add.u16x2 t1, %1, %2; \n\t"
         "max.u16x2 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
    // Pre-SM90 device path: SIMD halfword add, then SIMD unsigned max.
    res = __vmaxu2(__vadd2(a, b), c);
#else
    // Host fallback: process each 16-bit half separately.
    unsigned short a_lo = (unsigned short)(a & 0xFFFFU);
    unsigned short a_hi = (unsigned short)(a >> 16);
    unsigned short b_lo = (unsigned short)(b & 0xFFFFU);
    unsigned short b_hi = (unsigned short)(b >> 16);
    unsigned short c_lo = (unsigned short)(c & 0xFFFFU);
    unsigned short c_hi = (unsigned short)(c >> 16);

    // Sum truncated back to 16 bits before the max, matching the SIMD add.
    const unsigned short r_lo = (unsigned short)max((unsigned short)(a_lo + b_lo), c_lo);
    const unsigned short r_hi = (unsigned short)max((unsigned short)(a_hi + b_hi), c_hi);

    // Repack the two halves into one 32-bit word.
    res = ((unsigned int)r_lo) | (((unsigned int)r_hi) << 16);
#endif

    return res;
}
665
+
666
// Fused add-then-min for signed 32-bit values: min(a + b, c).
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmin_s32(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm ("{.reg .s32 t1; \n\t"
         "add.s32 t1, %1, %2; \n\t"
         "min.s32 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and pre-SM90 fallback.
    const int sum = a + b;
    return min(sum, c);
#endif
}
679
+
680
// Per-halfword fused add-then-min for packed s16x2 values: min(a + b, c)
// computed independently on each 16-bit half.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "add.s16x2 t1, %1, %2; \n\t"
         "min.s16x2 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
    // Pre-SM90 device path: SIMD halfword add, then SIMD signed min.
    res = __vmins2(__vadd2(a, b), c);
#else
    // Host fallback: process each 16-bit half separately.
    short a_lo = (short)(unsigned short)(a & 0xFFFFU);
    short a_hi = (short)(unsigned short)(a >> 16);
    short b_lo = (short)(unsigned short)(b & 0xFFFFU);
    short b_hi = (short)(unsigned short)(b >> 16);
    short c_lo = (short)(unsigned short)(c & 0xFFFFU);
    short c_hi = (short)(unsigned short)(c >> 16);

    // Sum truncated back to 16 bits before the min, matching the SIMD add.
    const short s_lo = (short)(a_lo + b_lo);
    const short s_hi = (short)(a_hi + b_hi);
    const short r_lo = (short)min(s_lo, c_lo);
    const short r_hi = (short)min(s_hi, c_hi);

    // Repack the two halves into one 32-bit word.
    res = ((unsigned int)(unsigned short)r_lo) |
          (((unsigned int)(unsigned short)r_hi) << 16);
#endif

    return res;
}
725
+
726
// Fused add-then-min for unsigned 32-bit values: min(a + b, c).
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_u32(const unsigned int a, const unsigned int b, const unsigned int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    unsigned int res;
    asm ("{.reg .u32 t1; \n\t"
         "add.u32 t1, %1, %2; \n\t"
         "min.u32 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and pre-SM90 fallback.
    const unsigned int sum = a + b;
    return min(sum, c);
#endif
}
739
+
740
// Per-halfword fused add-then-min for packed u16x2 values: min(a + b, c)
// computed independently on each 16-bit half.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "add.u16x2 t1, %1, %2; \n\t"
         "min.u16x2 %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
    // Pre-SM90 device path: SIMD halfword add, then SIMD unsigned min.
    res = __vminu2(__vadd2(a, b), c);
#else
    // Host fallback: process each 16-bit half separately.
    unsigned short a_lo = (unsigned short)(a & 0xFFFFU);
    unsigned short a_hi = (unsigned short)(a >> 16);
    unsigned short b_lo = (unsigned short)(b & 0xFFFFU);
    unsigned short b_hi = (unsigned short)(b >> 16);
    unsigned short c_lo = (unsigned short)(c & 0xFFFFU);
    unsigned short c_hi = (unsigned short)(c >> 16);

    // Sum truncated back to 16 bits before the min, matching the SIMD add.
    const unsigned short r_lo = (unsigned short)min((unsigned short)(a_lo + b_lo), c_lo);
    const unsigned short r_hi = (unsigned short)min((unsigned short)(a_hi + b_hi), c_hi);

    // Repack the two halves into one 32-bit word.
    res = ((unsigned int)r_lo) | (((unsigned int)r_hi) << 16);
#endif

    return res;
}
771
+
772
// Fused add-then-max with relu for signed 32-bit values:
// max(a + b, c) clamped below at zero.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmax_s32_relu(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm ("{.reg .s32 t1; \n\t"
         "add.s32 t1, %1, %2; \n\t"
         "max.s32.relu %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and pre-SM90 fallback.
    const int m = max(a + b, c);
    if (m > 0) {
        return m;
    }
    return 0;
#endif
}
787
+
788
// Per-halfword fused add-then-max with relu for packed s16x2 values:
// max(a + b, c) per half, clamped below at zero.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "add.s16x2 t1, %1, %2; \n\t"
         "max.s16x2.relu %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
    // Pre-SM90 device path: SIMD halfword add, then two-operand max-relu.
    res = __vimax_s16x2_relu(__vadd2(a, b), c);
#else
    // Host fallback: process each 16-bit half separately.
    short a_lo = (short)(unsigned short)(a & 0xFFFFU);
    short a_hi = (short)(unsigned short)(a >> 16);
    short b_lo = (short)(unsigned short)(b & 0xFFFFU);
    short b_hi = (short)(unsigned short)(b >> 16);
    short c_lo = (short)(unsigned short)(c & 0xFFFFU);
    short c_hi = (short)(unsigned short)(c >> 16);

    // Sum truncated back to 16 bits before the max, matching the SIMD add.
    short r_lo = (short)max((short)(a_lo + b_lo), c_lo);
    short r_hi = (short)max((short)(a_hi + b_hi), c_hi);

    // relu: negative halves become zero.
    if (r_lo < 0) { r_lo = 0; }
    if (r_hi < 0) { r_hi = 0; }

    // Repack the two halves into one 32-bit word.
    res = ((unsigned int)(unsigned short)r_lo) |
          (((unsigned int)(unsigned short)r_hi) << 16);
#endif

    return res;
}
836
+
837
// Fused add-then-min with relu for signed 32-bit values:
// min(a + b, c) clamped below at zero.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmin_s32_relu(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int res;
    asm ("{.reg .s32 t1; \n\t"
         "add.s32 t1, %1, %2; \n\t"
         "min.s32.relu %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
    return res;
#else
    // Host and pre-SM90 fallback.
    const int m = min(a + b, c);
    if (m > 0) {
        return m;
    }
    return 0;
#endif
}
852
+
853
// Per-halfword fused add-then-min with relu for packed s16x2 values:
// min(a + b, c) per half, clamped below at zero.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
    unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    asm ("{.reg .b32 t1; \n\t"
         "add.s16x2 t1, %1, %2; \n\t"
         "min.s16x2.relu %0, t1, %3;}\n\t"
         : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
    // Pre-SM90 device path: SIMD halfword add, then two-operand min-relu.
    res = __vimin_s16x2_relu(__vadd2(a, b), c);
#else
    // Host fallback: process each 16-bit half separately.
    short a_lo = (short)(unsigned short)(a & 0xFFFFU);
    short a_hi = (short)(unsigned short)(a >> 16);
    short b_lo = (short)(unsigned short)(b & 0xFFFFU);
    short b_hi = (short)(unsigned short)(b >> 16);
    short c_lo = (short)(unsigned short)(c & 0xFFFFU);
    short c_hi = (short)(unsigned short)(c >> 16);

    // Sum truncated back to 16 bits before the min, matching the SIMD add.
    short r_lo = (short)min((short)(a_lo + b_lo), c_lo);
    short r_hi = (short)min((short)(a_hi + b_hi), c_hi);

    // relu: negative halves become zero.
    if (r_lo < 0) { r_lo = 0; }
    if (r_hi < 0) { r_hi = 0; }

    // Repack the two halves into one 32-bit word.
    res = ((unsigned int)(unsigned short)r_lo) |
          (((unsigned int)(unsigned short)r_hi) << 16);
#endif

    return res;
}
901
+
902
// vimax/vimin variants that also report which operand won via a predicate.
// Signed 32-bit max with predicate: returns max(a, b); *pred is set to (a >= b).
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vibmax_s32(const int a, const int b, bool* const pred){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int val;
    unsigned int predicate_local;
    asm ("{ .reg .pred __$temp1;\n\t"
         " setp.ge.s32 __$temp1, %2, %3;\n\t"
         " selp.s32 %0, %2, %3, __$temp1;\n\t"
         " selp.s32 %1, 1, 0, __$temp1;}\n\t"
         : "=r"(val), "=r"(predicate_local) : "r"(a), "r"(b));
    *pred = (bool)predicate_local;
    return val;
#else
    // Host and pre-SM90 fallback.
    *pred = (a >= b);
    return max(a, b);
#endif
}
924
+
925
// Unsigned 32-bit max with predicate: returns max(a, b); *pred is set to (a >= b).
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmax_u32(const unsigned int a, const unsigned int b, bool* const pred){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    unsigned int val;
    unsigned int predicate_local;
    asm ("{ .reg .pred __$temp1;\n\t"
         " setp.ge.u32 __$temp1, %2, %3;\n\t"
         " selp.u32 %0, %2, %3, __$temp1;\n\t"
         " selp.u32 %1, 1, 0, __$temp1;}\n\t"
         : "=r"(val), "=r"(predicate_local) : "r"(a), "r"(b));
    *pred = (bool)predicate_local;
    return val;
#else
    // Host and pre-SM90 fallback.
    *pred = (a >= b);
    return max(a, b);
#endif
}
945
+
946
+ // *pred gets set to '(a <= b)'
947
// Signed 32-bit "vertical" min with predicate.
// Returns min(a, b); *pred is set to (a <= b).
// SM90+ device path: single PTX setp/selp sequence; otherwise scalar fallback.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vibmin_s32(const int a, const int b, bool* const pred){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    int result;
    unsigned int pred_bits;
    asm ("{ .reg .pred __$temp1;\n\t"
         " setp.le.s32 __$temp1, %2, %3;\n\t"
         " selp.s32 %0, %2, %3, __$temp1;\n\t"
         " selp.s32 %1, 1, 0, __$temp1;}\n\t"
         : "=r"(result), "=r"(pred_bits) : "r"(a), "r"(b));

    *pred = (bool)pred_bits;
    return result;
#else
    // Host and older-architecture fallback: pick a when a <= b,
    // which is exactly min(a, b) and the predicate in one comparison.
    const bool a_wins = (a <= b);
    *pred = a_wins;
    return a_wins ? a : b;
#endif
}
967
+
968
+ // *pred gets set to '(a <= b)'
969
// Unsigned 32-bit "vertical" min with predicate.
// Returns min(a, b); *pred is set to (a <= b).
// SM90+ device path: single PTX setp/selp sequence; otherwise scalar fallback.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmin_u32(const unsigned int a, const unsigned int b, bool* const pred){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    unsigned int result;
    unsigned int pred_bits;
    asm ("{ .reg .pred __$temp1;\n\t"
         " setp.le.u32 __$temp1, %2, %3;\n\t"
         " selp.u32 %0, %2, %3, __$temp1;\n\t"
         " selp.u32 %1, 1, 0, __$tem1;}\n\t"
         : "=r"(result), "=r"(pred_bits) : "r"(a), "r"(b));

    *pred = (bool)pred_bits;
    return result;
#else
    // Host and older-architecture fallback: pick a when a <= b,
    // which is exactly min(a, b) and the predicate in one comparison.
    const bool a_wins = (a <= b);
    *pred = a_wins;
    return a_wins ? a : b;
#endif
}
989
+
990
// Packed signed 16x2 "vertical" max with per-half predicates.
// Treats a and b as two packed signed 16-bit halves; returns the per-half max.
// *pred_hi / *pred_lo are set to (a_half >= b_half) for the high/low half
// (the host fallback below makes this semantics explicit).
// SM90+ device path uses the PTX max.s16x2 SIMD instruction; the predicates
// are derived by comparing each result half against a's half (equality holds
// exactly when a's half was selected, i.e. a_half >= b_half).
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmax_s16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    unsigned int val;
    unsigned int predicate_local_hi;
    unsigned int predicate_local_lo;
    // rs0/rs1 = low/high half of the packed result, rs2/rs3 = low/high half of a.
    asm ("{.reg .pred pu, pv; \n\t"
         ".reg .s16 rs0, rs1, rs2, rs3; \n\t"
         "max.s16x2 %0, %3, %4; \n\t"
         "mov.b32 {rs0, rs1}, %0; \n\t"
         "mov.b32 {rs2, rs3}, %3; \n\t"
         "setp.eq.s16 pv, rs0, rs2; \n\t"
         "setp.eq.s16 pu, rs1, rs3; \n\t"
         "selp.b32 %1, 1, 0, pu; \n\t"
         "selp.b32 %2, 1, 0, pv;} \n\t"
         : "=r"(val), "=r"(predicate_local_hi),"=r"(predicate_local_lo) : "r"(a), "r"(b));

    *pred_hi = (bool)predicate_local_hi;
    *pred_lo = (bool)predicate_local_lo;
    return val;
#else
    // Host and older architecture code
    // Separate our high and low bit:
    unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
    unsigned short aU_hi = (unsigned short)(a >> 16);

    unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
    unsigned short bU_hi = (unsigned short)(b >> 16);

    // cast to signed (pointer pun between corresponding signed/unsigned
    // 16-bit types reinterprets the bit pattern without value conversion):
    short aS_lo = *(short*)& aU_lo;
    short aS_hi = *(short*)& aU_hi;

    short bS_lo = *(short*)& bU_lo;
    short bS_hi = *(short*)& bU_hi;

    // Get answer: per-half signed max.
    short ansS_lo = (short)max(aS_lo, bS_lo);
    short ansS_hi = (short)max(aS_hi, bS_hi);

    // Predicate per half: true when a's half wins (ties count as a winning).
    *pred_hi = (aS_hi >= bS_hi);
    *pred_lo = (aS_lo >= bS_lo);

    // Cast back to unsigned:
    unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
    unsigned short ansU_hi = *(unsigned short*)& ansS_hi;

    // Put answer back together:
    unsigned int ans = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);

    return ans;
#endif
}
1042
+
1043
// Packed unsigned 16x2 "vertical" max with per-half predicates.
// Treats a and b as two packed unsigned 16-bit halves; returns the per-half max.
// *pred_hi / *pred_lo are set to (a_half >= b_half) for the high/low half.
// SM90+ device path uses the PTX max.u16x2 SIMD instruction; otherwise a
// scalar per-half fallback is used.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmax_u16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    unsigned int packed_result;
    unsigned int pred_bits_hi;
    unsigned int pred_bits_lo;
    // Predicates come from comparing each result half against a's half:
    // equality holds exactly when a's half was selected (a_half >= b_half).
    asm ("{.reg .pred pu, pv; \n\t"
         ".reg .u16 rs0, rs1, rs2, rs3; \n\t"
         "max.u16x2 %0, %3, %4; \n\t"
         "mov.b32 {rs0, rs1}, %0; \n\t"
         "mov.b32 {rs2, rs3}, %3; \n\t"
         "setp.eq.u16 pv, rs0, rs2; \n\t"
         "setp.eq.u16 pu, rs1, rs3; \n\t"
         "selp.b32 %1, 1, 0, pu; \n\t"
         "selp.b32 %2, 1, 0, pv;} \n\t"
         : "=r"(packed_result), "=r"(pred_bits_hi),"=r"(pred_bits_lo) : "r"(a), "r"(b));

    *pred_hi = (bool)pred_bits_hi;
    *pred_lo = (bool)pred_bits_lo;
    return packed_result;
#else
    // Host and older-architecture fallback: unpack, compare each half once,
    // and reuse the comparison for both the selection and the predicate.
    const unsigned short a_lo = (unsigned short)(a & 0xFFFFU);
    const unsigned short a_hi = (unsigned short)(a >> 16);
    const unsigned short b_lo = (unsigned short)(b & 0xFFFFU);
    const unsigned short b_hi = (unsigned short)(b >> 16);

    const bool lo_a_wins = (a_lo >= b_lo);
    const bool hi_a_wins = (a_hi >= b_hi);
    *pred_lo = lo_a_wins;
    *pred_hi = hi_a_wins;

    const unsigned short out_lo = lo_a_wins ? a_lo : b_lo;
    const unsigned short out_hi = hi_a_wins ? a_hi : b_hi;

    // Repack the two halves.
    return ((unsigned int) out_lo) | (((unsigned int) out_hi) << 16);
#endif
}
1084
+
1085
// Packed signed 16x2 "vertical" min with per-half predicates.
// Treats a and b as two packed signed 16-bit halves; returns the per-half min.
// *pred_hi / *pred_lo are set to (a_half <= b_half) for the high/low half
// (the host fallback below makes this semantics explicit).
// SM90+ device path uses the PTX min.s16x2 SIMD instruction; the predicates
// are derived by comparing each result half against a's half (equality holds
// exactly when a's half was selected, i.e. a_half <= b_half).
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmin_s16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    unsigned int val;
    unsigned int predicate_local_hi;
    unsigned int predicate_local_lo;
    // Fix: declare the scratch registers as .s16 (was .u16) so the register
    // type matches the signed min.s16x2 / setp.eq.s16 uses below, consistent
    // with the sibling __vibmax_s16x2. rs0/rs1 = low/high half of the result,
    // rs2/rs3 = low/high half of a.
    asm ("{.reg .pred pu, pv; \n\t"
         ".reg .s16 rs0, rs1, rs2, rs3; \n\t"
         "min.s16x2 %0, %3, %4; \n\t"
         "mov.b32 {rs0, rs1}, %0; \n\t"
         "mov.b32 {rs2, rs3}, %3; \n\t"
         "setp.eq.s16 pv, rs0, rs2; \n\t"
         "setp.eq.s16 pu, rs1, rs3; \n\t"
         "selp.b32 %1, 1, 0, pu; \n\t"
         "selp.b32 %2, 1, 0, pv;} \n\t"
         : "=r"(val), "=r"(predicate_local_hi),"=r"(predicate_local_lo) : "r"(a), "r"(b));

    *pred_hi = (bool)predicate_local_hi;
    *pred_lo = (bool)predicate_local_lo;
    return val;
#else
    // Host and older architecture code
    // Separate our high and low bit:
    unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
    unsigned short aU_hi = (unsigned short)(a >> 16);

    unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
    unsigned short bU_hi = (unsigned short)(b >> 16);

    // cast to signed (pointer pun between corresponding signed/unsigned
    // 16-bit types reinterprets the bit pattern without value conversion):
    short aS_lo = *(short*)& aU_lo;
    short aS_hi = *(short*)& aU_hi;

    short bS_lo = *(short*)& bU_lo;
    short bS_hi = *(short*)& bU_hi;

    // Get answer: per-half signed min.
    short ansS_lo = (short)min(aS_lo, bS_lo);
    short ansS_hi = (short)min(aS_hi, bS_hi);

    // Predicate per half: true when a's half wins (ties count as a winning).
    *pred_hi = (aS_hi <= bS_hi);
    *pred_lo = (aS_lo <= bS_lo);

    // Cast back to unsigned:
    unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
    unsigned short ansU_hi = *(unsigned short*)& ansS_hi;

    // Put answer back together:
    unsigned int ans = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);

    return ans;
#endif
}
1137
+
1138
// Packed unsigned 16x2 "vertical" min with per-half predicates.
// Treats a and b as two packed unsigned 16-bit halves; returns the per-half min.
// *pred_hi / *pred_lo are set to (a_half <= b_half) for the high/low half.
// SM90+ device path uses the PTX min.u16x2 SIMD instruction; otherwise a
// scalar per-half fallback is used.
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmin_u16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
    unsigned int packed_result;
    unsigned int pred_bits_hi;
    unsigned int pred_bits_lo;
    // Predicates come from comparing each result half against a's half:
    // equality holds exactly when a's half was selected (a_half <= b_half).
    asm ("{.reg .pred pu, pv; \n\t"
         ".reg .u16 rs0, rs1, rs2, rs3; \n\t"
         "min.u16x2 %0, %3, %4; \n\t"
         "mov.b32 {rs0, rs1}, %0; \n\t"
         "mov.b32 {rs2, rs3}, %3; \n\t"
         "setp.eq.u16 pv, rs0, rs2; \n\t"
         "setp.eq.u16 pu, rs1, rs3; \n\t"
         "selp.b32 %1, 1, 0, pu; \n\t"
         "selp.b32 %2, 1, 0, pv;} \n\t"
         : "=r"(packed_result), "=r"(pred_bits_hi),"=r"(pred_bits_lo) : "r"(a), "r"(b));

    *pred_hi = (bool)pred_bits_hi;
    *pred_lo = (bool)pred_bits_lo;
    return packed_result;
#else
    // Host and older-architecture fallback: unpack, compare each half once,
    // and reuse the comparison for both the selection and the predicate.
    const unsigned short a_lo = (unsigned short)(a & 0xFFFFU);
    const unsigned short a_hi = (unsigned short)(a >> 16);
    const unsigned short b_lo = (unsigned short)(b & 0xFFFFU);
    const unsigned short b_hi = (unsigned short)(b >> 16);

    const bool lo_a_wins = (a_lo <= b_lo);
    const bool hi_a_wins = (a_hi <= b_hi);
    *pred_lo = lo_a_wins;
    *pred_hi = hi_a_wins;

    const unsigned short out_lo = lo_a_wins ? a_lo : b_lo;
    const unsigned short out_hi = hi_a_wins ? a_hi : b_hi;

    // Repack the two halves.
    return ((unsigned int) out_lo) | (((unsigned int) out_hi) << 16);
#endif
}
1179
+
1180
+ #ifdef __CUDA_AND_AT_LEAST_SM_90__
1181
+ #undef __CUDA_AND_AT_LEAST_SM_90__
1182
+ #endif
1183
+
1184
+ #undef __DEVICE_HOST_FUNCTIONS_STATIC_DECL__
1185
+
1186
+ /*******************************************************************************
1187
+ * *
1188
+ * *
1189
+ * *
1190
+ *******************************************************************************/
1191
+
1192
+ #endif /* !__DEVICE_FUNCTIONS_HPP__ */
1193
+
1194
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__)
1195
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
1196
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__
1197
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/func_macro.h ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * NVIDIA_COPYRIGHT_BEGIN
3
+ *
4
+ * Copyright (c) 2008-2018, NVIDIA CORPORATION. All rights reserved.
5
+ *
6
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
7
+ * and proprietary rights in and to this software, related documentation
8
+ * and any modifications thereto. Any use, reproduction, disclosure or
9
+ * distribution of this software and related documentation without an express
10
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
11
+ *
12
+ * NVIDIA_COPYRIGHT_END
13
+ */
14
+
15
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
16
+ #if defined(_MSC_VER)
17
+ #pragma message("crt/func_macro.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
18
+ #else
19
+ #warning "crt/func_macro.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
20
+ #endif
21
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
22
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__
23
+ #endif
24
+
25
+ #if !defined(__FUNC_MACRO_H__)
26
+ #define __FUNC_MACRO_H__
27
+
28
+ #if !defined(__CUDA_INTERNAL_COMPILATION__)
29
+
30
+ #error -- incorrect inclusion of a cudart header file
31
+
32
+ #endif /* !__CUDA_INTERNAL_COMPILATION__ */
33
+
34
+ #if defined(__GNUC__)
35
+
36
+ #define __func__(decl) \
37
+ inline decl
38
+
39
+ #define __device_func__(decl) \
40
+ static __attribute__((__unused__)) decl
41
+
42
+ #elif defined(_WIN32)
43
+
44
+ #define __func__(decl) \
45
+ static inline decl
46
+
47
+ #define __device_func__(decl) \
48
+ static decl
49
+
50
+ #endif /* __GNUC__ */
51
+
52
+ #endif /* __FUNC_MACRO_H__ */
53
+
54
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__)
55
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
56
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__
57
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_config.h ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2024 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("crt/host_config.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "crt/host_config.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__
58
+ #endif
59
+
60
+ #if !defined(__HOST_CONFIG_H__)
61
+ #define __HOST_CONFIG_H__
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #if defined(__CUDACC__)
70
+
71
+ #if defined(__CUDACC_RTC__)
72
+
73
+ #define _CRTIMP
74
+ #define __THROW
75
+
76
+ #else /* __CUDACC_RTC__ */
77
+
78
+ /* check for host compilers that are compatible with nvcc */
79
+ #if !defined(__GNUC__) && !defined(_WIN32)
80
+
81
+ #error --- !!! UNSUPPORTED COMPILER !!! ---
82
+
83
+ #endif /* !__GNUC__ && !_WIN32 */
84
+
85
+ /* check invalid configurations */
86
+ #if defined(__PGIC__)
87
+ #if !defined(__GNUC__) || !defined(__LP64__) || !defined(__linux__)
88
+ #error -- unsupported pgc++ configuration! pgc++ is supported only on Linux x86_64!
89
+ #endif /* !defined(__GNUC__) || !defined(__LP64__) || !defined(__linux__) */
90
+ #endif /* defined(__PGIC__) */
91
+
92
+ #if defined(__powerpc__)
93
+ #if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
94
+ #error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
95
+ #endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
96
+ #endif /* __powerpc__ */
97
+
98
+ #if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
99
+ #error -- clang and clang++ are the only supported host compilers on Mac OS X!
100
+ #endif /* __APPLE__ && __MACH__ && !__clang__ */
101
+
102
+
103
+ /* check host compiler version */
104
+ #if !__NV_NO_HOST_COMPILER_CHECK
105
+
106
+ #if defined(__ICC)
107
+
108
+ #if (__ICC != 1500 && __ICC != 1600 && __ICC != 1700 && __ICC != 1800 && !(__ICC >= 1900 && __ICC <= 2021)) || !defined(__GNUC__) || !defined(__LP64__)
109
+
110
+ #error -- unsupported ICC configuration! Only ICC 15.0, ICC 16.0, ICC 17.0, ICC 18.0, ICC 19.x and 20.x on Linux x86_64 are supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
111
+
112
+ #endif /* (__ICC != 1500 && __ICC != 1600 && __ICC != 1700 && __ICC != 1800 && __ICC != 1900) || !__GNUC__ || !__LP64__ */
113
+
114
+ #endif /* __ICC */
115
+
116
+ #if defined(__GRCO_CLANG_COMPILER__)
117
+ #if (__GRCO_CLANG_COMPILER__ == 1) && ((__clang_major__ < 16) || (__clang_major__ > 17))
118
+ #error -- unsupported Grace clang version! The version must be 16.x to 17.x. The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
119
+ #endif /* (__GRCO_CLANG_COMPILER__ == 1) && ((__clang_major__ < 16) || (__clang_major__ > 17)) */
120
+
121
+ #endif /* __GRCO_CLANG_COMPILER__ */
122
+
123
+ #if defined(__INTEL_CLANG_COMPILER)
124
+ #error -- unsupported Intel ICX compiler! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
125
+ #endif /* __INTEL_CLANG_COMPILER */
126
+
127
+ #if defined(__powerpc__)
128
+
129
+ #if defined(__ibmxl_vrm__) && !(__ibmxl_vrm__ >= 0x0d010000 && __ibmxl_vrm__ < 0x0d020000) && \
130
+ !(__ibmxl_vrm__ >= 0x10010000 && __ibmxl_vrm__ < 0x10020000)
131
+
132
+ #error -- unsupported xlC version! only xlC 13.1 and 16.1 are supported. The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
133
+
134
+ #endif /* __ibmxl_vrm__ && !(__ibmxl_vrm__ >= 0x0d010000 && __ibmxl_vrm__ < 0x0d020000) &&
135
+ !(__ibmxl_vrm__ >= 0x10010000 && __ibmxl_vrm__ < 0x10020000) */
136
+
137
+ #endif /* __powerpc__ */
138
+
139
+ #if defined(__GNUC__)
140
+
141
+ #if __GNUC__ > 13
142
+
143
+ #error -- unsupported GNU version! gcc versions later than 13 are not supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
144
+
145
+ #endif /* __GNUC__ > 13 */
146
+
147
+
148
+ #if defined(__HORIZON__)
149
+ #if (__clang_major__ >= 18) || (__clang_major__ < 3) || ((__clang_major__ == 3) && (__clang_minor__ < 3))
150
+ #error -- unsupported HOS clang version! The version must be must be less than 18 and greater than 3.2 . The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
151
+ #endif /* (__clang_major__ >= 18) || (__clang_major__ < 3) || ((__clang_major__ == 3) && (__clang_minor__ < 3)) */
152
+ #endif /* __HORIZON__ */
153
+
154
+ #if defined(__clang__) && !defined(__ibmxl_vrm__) && !defined(__ICC) && !defined(__HORIZON__) && !defined(__APPLE__) && !defined(__GRCO_CLANG_COMPILER__)
155
+
156
+ #if (__clang_major__ >= 18) || (__clang_major__ < 3) || ((__clang_major__ == 3) && (__clang_minor__ < 3))
157
+ #error -- unsupported clang version! clang version must be less than 18 and greater than 3.2 . The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
158
+
159
+ #endif /* (__clang_major__ >= 18) || (__clang_major__ < 3) || ((__clang_major__ == 3) && (__clang_minor__ < 3)) */
160
+
161
+ #endif /* defined(__clang__) && !defined(__ibmxl_vrm__) && !defined(__ICC) && !defined(__HORIZON__) && !defined(__APPLE__) && !defined(__GRCO_CLANG_COMPILER__) */
162
+
163
+
164
+ #endif /* __GNUC__ */
165
+
166
+ #if defined(_WIN32)
167
+
168
+ #if _MSC_VER < 1910 || _MSC_VER >= 1950
169
+
170
+ #error -- unsupported Microsoft Visual Studio version! Only the versions between 2017 and 2022 (inclusive) are supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
171
+
172
+ #elif _MSC_VER >= 1910 && _MSC_VER < 1910
173
+
174
+ #pragma message("support for this version of Microsoft Visual Studio has been deprecated! Only the versions between 2017 and 2022 (inclusive) are supported!")
175
+
176
+ #endif /* (_MSC_VER < 1910 || _MSC_VER >= 1950) || (_MSC_VER >= 1910 && _MSC_VER < 1910) */
177
+
178
+ #endif /* _WIN32 */
179
+ #endif /* !__NV_NO_HOST_COMPILER_CHECK */
180
+
181
+
182
+ /* configure host compiler */
183
+ #if defined(__APPLE__)
184
+
185
+ #define _CRTIMP
186
+ #define _ACRTIMP
187
+ #define __THROW
188
+
189
+ #if defined(__BLOCKS__) /* nvcc does not support closures */
190
+
191
+ #undef __BLOCKS__
192
+
193
+ #endif /* __BLOCKS__ */
194
+
195
+ #elif defined(__ANDROID__)
196
+
197
+ #define _CRTIMP
198
+ #define _ACRTIMP
199
+ #define __THROW
200
+
201
+ #elif defined(__QNX__)
202
+
203
+ #define _CRTIMP
204
+ #define _ACRTIMP
205
+ #define __THROW
206
+
207
+ #elif defined(__HORIZON__)
208
+
209
+ #define _CRTIMP
210
+ #define _ACRTIMP
211
+ #define __THROW
212
+
213
+ #elif defined(__GNUC__)
214
+
215
+ #define _CRTIMP
216
+ #define _ACRTIMP
217
+
218
+ #include <features.h> /* for __THROW */
219
+
220
+ #elif defined(_WIN32)
221
+
222
+ #if _MSC_VER >= 1500
223
+
224
+ #undef _USE_DECLSPECS_FOR_SAL
225
+ #define _USE_DECLSPECS_FOR_SAL \
226
+ 1
227
+
228
+ #endif /* _MSC_VER >= 1500 */
229
+
230
+ #if !defined(_CRT_NONSTDC_NO_WARNINGS)
231
+
232
+ #define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
233
+
234
+ #endif /* !_CRT_NONSTDC_NO_WARNINGS */
235
+
236
+ #if !defined(_CRT_SECURE_NO_WARNINGS)
237
+
238
+ #define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
239
+
240
+ #endif /* !_CRT_SECURE_NO_WARNINGS */
241
+
242
+ #if !defined(NOMINMAX)
243
+
244
+ #define NOMINMAX /* min and max are part of cuda runtime */
245
+
246
+ #endif /* !NOMINMAX */
247
+
248
+ #include <crtdefs.h> /* for _CRTIMP */
249
+ #if _MSC_VER >= 1900
250
+ #include <corecrt.h> /* for _ACRTIMP */
251
+ #endif /* _MSC_VER >= 1900 */
252
+
253
+ #define __THROW
254
+
255
+ #endif /* __APPLE__ */
256
+
257
+ #endif /* __CUDACC_RTC__ */
258
+
259
+
260
+ #if defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER)))
261
+
262
+ #if __CUDACC_RTC__
263
+ typedef char *va_list;
264
+ #else /* !__CUDACC_RTC__ */
265
+ #include <cstdarg>
266
+ #endif /* __CUDACC_RTC__ */
267
+
268
+
269
+ #undef va_start
270
+ #undef va_end
271
+ #undef va_arg
272
+
273
+ #ifdef __PGIC__
274
+
275
+ #undef __builtin_va_end
276
+
277
+ #define va_start(v,l) __builtin_alt_va_start(v,l)
278
+ #define va_end(v) __builtin_va_end(v)
279
+ #define va_arg(v,l) __builtin_alt_va_arg(v,l)
280
+
281
+ #if (__cplusplus >= 201103L)
282
+ #undef va_copy
283
+ #define va_copy(d,s) __builtin_va_copy(d,s)
284
+ #endif
285
+
286
+ #else /* !__PGIC__ */
287
+
288
+
289
+ #define va_start(ap, x) (__cu_va_start(&ap, x))
290
+ #define va_end(ap) (__cu_va_end(&ap))
291
+ #define va_arg(ap, t) (*((t *)__cu_va_arg(&ap, (t *)0)))
292
+
293
+ #if (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L))
294
+ #undef va_copy
295
+ #define va_copy(apd, aps) (__cu_va_copy(&(apd), &(aps)))
296
+ #endif /* (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L)) */
297
+ #endif /* __PGIC__ */
298
+
299
+ #endif /* defined(__cplusplus) && (defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER))) */
300
+
301
+
302
+
303
+ #endif /* __CUDACC__ */
304
+
305
+ #endif /* !__HOST_CONFIG_H__ */
306
+
307
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__)
308
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
309
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__
310
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_defines.h ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("crt/host_defines.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "crt/host_defines.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__
58
+ #endif
59
+
60
+ #if !defined(__HOST_DEFINES_H__)
61
+ #define __HOST_DEFINES_H__
62
+
63
+ #if defined(__CUDACC__) && !defined(__CUDACC_RTC__) && !defined(__CUDADEVRT_INTERNAL__) && !defined(_ALLOW_UNSUPPORTED_LIBCPP)
64
+ #include <ctype.h>
65
+ #if ((defined(_MSC_VER ) && (defined(_M_X64) || defined(_M_AMD64))) ||\
66
+ (defined(__x86_64__) || defined(__amd64__))) && defined(_LIBCPP_VERSION) && !(defined(__HORIZON__) || defined(__ANDROID__) || defined(__QNX__))
67
+ #error "libc++ is not supported on x86 system"
68
+ #endif
69
+ #endif
70
+
71
+ /* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */
72
+ #if defined(__GNUC__) || (defined(__PGIC__) && defined(__linux__)) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__)
73
+
74
+ #if defined(__CUDACC_RTC__)
75
+ #define __volatile__ volatile
76
+ #endif /* __CUDACC_RTC__ */
77
+
78
+ #define __no_return__ \
79
+ __attribute__((noreturn))
80
+
81
+ #if defined(__CUDACC__) || defined(__CUDA_ARCH__) || defined(__CUDA_LIBDEVICE__)
82
+ /* gcc allows users to define attributes with underscores,
83
+ e.g., __attribute__((__noinline__)).
84
+ Consider a non-CUDA source file (e.g. .cpp) that has the
85
+ above attribute specification, and includes this header file. In that case,
86
+ defining __noinline__ as below would cause a gcc compilation error.
87
+ Hence, only define __noinline__ when the code is being processed
88
+ by a CUDA compiler component.
89
+ */
90
+ #define __noinline__ \
91
+ __attribute__((noinline))
92
+ #endif /* __CUDACC__ || __CUDA_ARCH__ || __CUDA_LIBDEVICE__ */
93
+
94
+ #undef __forceinline__
95
+ #define __forceinline__ \
96
+ __inline__ __attribute__((always_inline))
97
+ #define __inline_hint__ \
98
+ __attribute__((nv_inline_hint))
99
+ #define __align__(n) \
100
+ __attribute__((aligned(n)))
101
+ #define __maxnreg__(a) \
102
+ __attribute__((maxnreg(a)))
103
+ #define __thread__ \
104
+ __thread
105
+ #define __import__
106
+ #define __export__
107
+ #define __cdecl
108
+ #define __annotate__(a) \
109
+ __attribute__((a))
110
+ #define __location__(a) \
111
+ __annotate__(a)
112
+ #define CUDARTAPI
113
+ #define CUDARTAPI_CDECL
114
+
115
+ #elif defined(_MSC_VER)
116
+
117
+ #if _MSC_VER >= 1400
118
+
119
+ #define __restrict__ \
120
+ __restrict
121
+
122
+ #else /* _MSC_VER >= 1400 */
123
+
124
+ #define __restrict__
125
+
126
+ #endif /* _MSC_VER >= 1400 */
127
+
128
+ #define __inline__ \
129
+ __inline
130
+ #define __no_return__ \
131
+ __declspec(noreturn)
132
+ #define __noinline__ \
133
+ __declspec(noinline)
134
+ #define __forceinline__ \
135
+ __forceinline
136
+ #define __inline_hint__ \
137
+ __declspec(nv_inline_hint)
138
+ #define __align__(n) \
139
+ __declspec(align(n))
140
+ #define __maxnreg__(n) \
141
+ __declspec(maxnreg(n))
142
+ #define __thread__ \
143
+ __declspec(thread)
144
+ #define __import__ \
145
+ __declspec(dllimport)
146
+ #define __export__ \
147
+ __declspec(dllexport)
148
+ #define __annotate__(a) \
149
+ __declspec(a)
150
+ #define __location__(a) \
151
+ __annotate__(__##a##__)
152
+ #define CUDARTAPI \
153
+ __stdcall
154
+ #define CUDARTAPI_CDECL \
155
+ __cdecl
156
+
157
+ #else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
158
+
159
+ #define __inline__
160
+
161
+ #if !defined(__align__)
162
+
163
+ #error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! ---
164
+
165
+ #endif /* !__align__ */
166
+
167
+ #if !defined(CUDARTAPI)
168
+
169
+ #error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! ---
170
+
171
+ #endif /* !CUDARTAPI */
172
+
173
+ #endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
174
+
175
+ #if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \
176
+ (defined(_MSC_VER) && _MSC_VER < 1900) || \
177
+ (!defined(__GNUC__) && !defined(_MSC_VER))
178
+
179
+ #define __specialization_static \
180
+ static
181
+
182
+ #else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
183
+ (_MSC_VER && _MSC_VER < 1900) ||
184
+ (!__GNUC__ && !_MSC_VER) */
185
+
186
+ #define __specialization_static
187
+
188
+ #endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
189
+ (_MSC_VER && _MSC_VER < 1900) ||
190
+ (!__GNUC__ && !_MSC_VER) */
191
+
192
+ #if !defined(__CUDACC__) && !defined(__CUDA_LIBDEVICE__)
193
+
194
+ #undef __annotate__
195
+ #define __annotate__(a)
196
+
197
+ #else /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
198
+
199
+ #define __launch_bounds__(...) \
200
+ __annotate__(launch_bounds(__VA_ARGS__))
201
+
202
+ #endif /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
203
+
204
+ #if defined(__CUDACC__) || defined(__CUDA_LIBDEVICE__) || \
205
+ defined(__GNUC__) || defined(_WIN64)
206
+
207
+ #define __builtin_align__(a) \
208
+ __align__(a)
209
+
210
+ #else /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
211
+
212
+ #define __builtin_align__(a)
213
+
214
+ #endif /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
215
+
216
+ #if defined(__CUDACC__) || !defined(__grid_constant__)
217
+ #define __grid_constant__ \
218
+ __location__(grid_constant)
219
+ #endif /* defined(__CUDACC__) || !defined(__grid_constant__) */
220
+
221
+ #if defined(__CUDACC__) || !defined(__host__)
222
+ #define __host__ \
223
+ __location__(host)
224
+ #endif /* defined(__CUDACC__) || !defined(__host__) */
225
+ #if defined(__CUDACC__) || !defined(__device__)
226
+ #define __device__ \
227
+ __location__(device)
228
+ #endif /* defined(__CUDACC__) || !defined(__device__) */
229
+ #if defined(__CUDACC__) || !defined(__global__)
230
+ #define __global__ \
231
+ __location__(global)
232
+ #endif /* defined(__CUDACC__) || !defined(__global__) */
233
+ #if defined(__CUDACC__) || !defined(__shared__)
234
+ #define __shared__ \
235
+ __location__(shared)
236
+ #endif /* defined(__CUDACC__) || !defined(__shared__) */
237
+ #if defined(__CUDACC__) || !defined(__constant__)
238
+ #define __constant__ \
239
+ __location__(constant)
240
+ #endif /* defined(__CUDACC__) || !defined(__constant__) */
241
+ #if defined(__CUDACC__) || !defined(__managed__)
242
+ #define __managed__ \
243
+ __location__(managed)
244
+ #endif /* defined(__CUDACC__) || !defined(__managed__) */
245
+
246
+ #if !defined(__CUDACC__)
247
+ #define __device_builtin__
248
+ #define __device_builtin_texture_type__
249
+ #define __device_builtin_surface_type__
250
+ #define __cudart_builtin__
251
+ #else /* defined(__CUDACC__) */
252
+ #define __device_builtin__ \
253
+ __location__(device_builtin)
254
+ #define __device_builtin_texture_type__ \
255
+ __location__(device_builtin_texture_type)
256
+ #define __device_builtin_surface_type__ \
257
+ __location__(device_builtin_surface_type)
258
+ #define __cudart_builtin__ \
259
+ __location__(cudart_builtin)
260
+ #endif /* !defined(__CUDACC__) */
261
+
262
+ #if defined(__CUDACC__) || !defined(__cluster_dims__)
263
+ #if defined(_MSC_VER)
264
+ #define __cluster_dims__(...) \
265
+ __declspec(__cluster_dims__(__VA_ARGS__))
266
+
267
+ #else /* !defined(_MSC_VER) */
268
+ #define __cluster_dims__(...) \
269
+ __attribute__((cluster_dims(__VA_ARGS__)))
270
+ #endif /* defined(_MSC_VER) */
271
+ #endif /* defined(__CUDACC__) || !defined(__cluster_dims__) */
272
+
273
+ #define __CUDA_ARCH_HAS_FEATURE__(_FEAT) __CUDA_ARCH_FEAT_##_FEAT
274
+
275
+ #endif /* !__HOST_DEFINES_H__ */
276
+
277
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__)
278
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
279
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H__
280
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/host_runtime.h ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * NVIDIA_COPYRIGHT_BEGIN
3
+ *
4
+ * Copyright (c) 2008-2023, NVIDIA CORPORATION. All rights reserved.
5
+ *
6
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
7
+ * and proprietary rights in and to this software, related documentation
8
+ * and any modifications thereto. Any use, reproduction, disclosure or
9
+ * distribution of this software and related documentation without an express
10
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
11
+ *
12
+ * NVIDIA_COPYRIGHT_END
13
+ */
14
+
15
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
16
+ #if defined(_MSC_VER)
17
+ #pragma message("crt/device_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
18
+ #else
19
+ #warning "crt/device_functions.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
20
+ #endif
21
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
22
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__
23
+ #endif
24
+
25
+ #if !defined(__CUDA_INTERNAL_COMPILATION__)
26
+
27
+ #define __CUDA_INTERNAL_COMPILATION__
28
+ #define __text__
29
+ #define __surf__
30
+ #define __name__shadow_var(c, cpp) \
31
+ #c
32
+ #define __name__text_var(c, cpp) \
33
+ #cpp
34
+ #define __host__shadow_var(c, cpp) \
35
+ cpp
36
+ #define __text_var(c, cpp) \
37
+ cpp
38
+ #define __device_fun(fun) \
39
+ #fun
40
+ #define __device_var(var) \
41
+ #var
42
+ #define __device__text_var(c, cpp) \
43
+ #c
44
+ #define __device__shadow_var(c, cpp) \
45
+ #c
46
+
47
+ #if defined(_WIN32) && !defined(_WIN64)
48
+
49
+ #define __pad__(f) \
50
+ f
51
+
52
+ #else /* _WIN32 && !_WIN64 */
53
+
54
+ #define __pad__(f)
55
+
56
+ #endif /* _WIN32 && !_WIN64 */
57
+
58
+ #include "builtin_types.h"
59
+ #include "storage_class.h"
60
+
61
+ #else /* !__CUDA_INTERNAL_COMPILATION__ */
62
+
63
+ template <typename T>
64
+ static inline T *__cudaAddressOf(T &val)
65
+ {
66
+ return (T *)((void *)(&(const_cast<char &>(reinterpret_cast<const volatile char &>(val)))));
67
+ }
68
+
69
+ #define __cudaRegisterBinary(X) \
70
+ __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)&__fatDeviceText); \
71
+ { void (*callback_fp)(void **) = (void (*)(void **))(X); (*callback_fp)(__cudaFatCubinHandle); __cudaRegisterFatBinaryEnd(__cudaFatCubinHandle); }\
72
+ atexit(__cudaUnregisterBinaryUtil)
73
+
74
+ #define __cudaRegisterVariable(handle, var, ext, size, constant, global) \
75
+ __cudaRegisterVar(handle, (char*)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
76
+ #define __cudaRegisterManagedVariable(handle, var, ext, size, constant, global) \
77
+ __cudaRegisterManagedVar(handle, (void **)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
78
+
79
+ #define __cudaRegisterGlobalTexture(handle, tex, dim, norm, ext) \
80
+ __cudaRegisterTexture(handle, (const struct textureReference*)&tex, (const void**)(void*)__device##tex, __name##tex, dim, norm, ext)
81
+ #define __cudaRegisterGlobalSurface(handle, surf, dim, ext) \
82
+ __cudaRegisterSurface(handle, (const struct surfaceReference*)&surf, (const void**)(void*)__device##surf, __name##surf, dim, ext)
83
+ #define __cudaRegisterEntry(handle, funptr, fun, thread_limit) \
84
+ __cudaRegisterFunction(handle, (const char*)funptr, (char*)__device_fun(fun), #fun, -1, (uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0)
85
+
86
+ extern "C" cudaError_t CUDARTAPI __cudaPopCallConfiguration(
87
+ dim3 *gridDim,
88
+ dim3 *blockDim,
89
+ size_t *sharedMem,
90
+ void *stream
91
+ );
92
+
93
+ #define __cudaLaunchPrologue(size) \
94
+ void * __args_arr[size]; \
95
+ int __args_idx = 0
96
+
97
+ #define __cudaSetupArg(arg, offset) \
98
+ __args_arr[__args_idx] = (void *)__cudaAddressOf(arg); ++__args_idx
99
+
100
+ #define __cudaSetupArgSimple(arg, offset) \
101
+ __args_arr[__args_idx] = (void *)(char *)&arg; ++__args_idx
102
+
103
+ #if defined(__GNUC__)
104
+ #define __NV_ATTR_UNUSED_FOR_LAUNCH __attribute__((unused))
105
+ #else /* !__GNUC__ */
106
+ #define __NV_ATTR_UNUSED_FOR_LAUNCH
107
+ #endif /* __GNUC__ */
108
+
109
+ #ifdef __NV_LEGACY_LAUNCH
110
+ /* the use of __args_idx in the expression below avoids host compiler warning about it being an
111
+ unused variable when the launch has no arguments */
112
+ #define __cudaLaunch(fun) \
113
+ { volatile static char *__f __NV_ATTR_UNUSED_FOR_LAUNCH; __f = fun; \
114
+ dim3 __gridDim, __blockDim;\
115
+ size_t __sharedMem; \
116
+ cudaStream_t __stream; \
117
+ if (__cudaPopCallConfiguration(&__gridDim, &__blockDim, &__sharedMem, &__stream) != cudaSuccess) \
118
+ return; \
119
+ if (__args_idx == 0) {\
120
+ (void)cudaLaunchKernel(fun, __gridDim, __blockDim, &__args_arr[__args_idx], __sharedMem, __stream);\
121
+ } else { \
122
+ (void)cudaLaunchKernel(fun, __gridDim, __blockDim, &__args_arr[0], __sharedMem, __stream);\
123
+ }\
124
+ }
125
+ #else /* !__NV_LEGACY_LAUNCH */
126
+ #define __cudaLaunch(fun) \
127
+ { volatile static char *__f __NV_ATTR_UNUSED_FOR_LAUNCH; __f = fun; \
128
+ static cudaKernel_t __handle = 0; \
129
+ volatile static bool __tmp __NV_ATTR_UNUSED_FOR_LAUNCH = (__cudaGetKernel(&__handle, (const void *)fun) == cudaSuccess); \
130
+ dim3 __gridDim, __blockDim;\
131
+ size_t __sharedMem; \
132
+ cudaStream_t __stream; \
133
+ if (__cudaPopCallConfiguration(&__gridDim, &__blockDim, &__sharedMem, &__stream) != cudaSuccess) \
134
+ return; \
135
+ if (__args_idx == 0) {\
136
+ (void)__cudaLaunchKernel_helper(__handle, __gridDim, __blockDim, &__args_arr[__args_idx], __sharedMem, __stream);\
137
+ } else { \
138
+ (void)__cudaLaunchKernel_helper(__handle, __gridDim, __blockDim, &__args_arr[0], __sharedMem, __stream);\
139
+ }\
140
+ }
141
+ #endif /* __NV_LEGACY_LAUNCH */
142
+
143
+ #if defined(__GNUC__)
144
+ #define __nv_dummy_param_ref(param) \
145
+ { volatile static void **__ref __attribute__((unused)); __ref = (volatile void **)param; }
146
+ #else /* __GNUC__ */
147
+ #define __nv_dummy_param_ref(param) \
148
+ { volatile static void **__ref; __ref = (volatile void **)param; }
149
+ #endif /* __GNUC__ */
150
+
151
+ static void ____nv_dummy_param_ref(void *param) __nv_dummy_param_ref(param)
152
+
153
+ #define __REGISTERFUNCNAME_CORE(X) __cudaRegisterLinkedBinary##X
154
+ #define __REGISTERFUNCNAME(X) __REGISTERFUNCNAME_CORE(X)
155
+
156
+ extern "C" {
157
+ void __REGISTERFUNCNAME( __NV_MODULE_ID ) ( void (*)(void **), void *, void *, void (*)(void *));
158
+ }
159
+
160
+ #define __TO_STRING_CORE(X) #X
161
+ #define __TO_STRING(X) __TO_STRING_CORE(X)
162
+
163
+ extern "C" {
164
+ #if defined(_WIN32)
165
+ #pragma data_seg("__nv_module_id")
166
+ static const __declspec(allocate("__nv_module_id")) unsigned char __module_id_str[] = __TO_STRING(__NV_MODULE_ID);
167
+ #pragma data_seg()
168
+ #elif defined(__APPLE__)
169
+ static const unsigned char __module_id_str[] __attribute__((section ("__NV_CUDA,__nv_module_id"))) = __TO_STRING(__NV_MODULE_ID);
170
+ #else
171
+ static const unsigned char __module_id_str[] __attribute__((section ("__nv_module_id"))) = __TO_STRING(__NV_MODULE_ID);
172
+ #endif
173
+
174
+ #undef __FATIDNAME_CORE
175
+ #undef __FATIDNAME
176
+ #define __FATIDNAME_CORE(X) __fatbinwrap##X
177
+ #define __FATIDNAME(X) __FATIDNAME_CORE(X)
178
+
179
+ #define ____cudaRegisterLinkedBinary(X) \
180
+ { __REGISTERFUNCNAME(__NV_MODULE_ID) (( void (*)(void **))(X), (void *)&__FATIDNAME(__NV_MODULE_ID), (void *)&__module_id_str, (void (*)(void *))&____nv_dummy_param_ref); }
181
+
182
+ }
183
+
184
+ extern "C" {
185
+ extern void** CUDARTAPI __cudaRegisterFatBinary(
186
+ void *fatCubin
187
+ );
188
+
189
+ extern void CUDARTAPI __cudaRegisterFatBinaryEnd(
190
+ void **fatCubinHandle
191
+ );
192
+
193
+ extern void CUDARTAPI __cudaUnregisterFatBinary(
194
+ void **fatCubinHandle
195
+ );
196
+
197
+ extern void CUDARTAPI __cudaRegisterVar(
198
+ void **fatCubinHandle,
199
+ char *hostVar,
200
+ char *deviceAddress,
201
+ const char *deviceName,
202
+ int ext,
203
+ size_t size,
204
+ int constant,
205
+ int global
206
+ );
207
+
208
+ extern void CUDARTAPI __cudaRegisterManagedVar(
209
+ void **fatCubinHandle,
210
+ void **hostVarPtrAddress,
211
+ char *deviceAddress,
212
+ const char *deviceName,
213
+ int ext,
214
+ size_t size,
215
+ int constant,
216
+ int global
217
+ );
218
+
219
+ extern char CUDARTAPI __cudaInitModule(
220
+ void **fatCubinHandle
221
+ );
222
+
223
+ extern void CUDARTAPI __cudaRegisterTexture(
224
+ void **fatCubinHandle,
225
+ const struct textureReference *hostVar,
226
+ const void **deviceAddress,
227
+ const char *deviceName,
228
+ int dim,
229
+ int norm,
230
+ int ext
231
+ );
232
+
233
+ extern void CUDARTAPI __cudaRegisterSurface(
234
+ void **fatCubinHandle,
235
+ const struct surfaceReference *hostVar,
236
+ const void **deviceAddress,
237
+ const char *deviceName,
238
+ int dim,
239
+ int ext
240
+ );
241
+
242
+ extern void CUDARTAPI __cudaRegisterFunction(
243
+ void **fatCubinHandle,
244
+ const char *hostFun,
245
+ char *deviceFun,
246
+ const char *deviceName,
247
+ int thread_limit,
248
+ uint3 *tid,
249
+ uint3 *bid,
250
+ dim3 *bDim,
251
+ dim3 *gDim,
252
+ int *wSize
253
+ );
254
+
255
+ #if defined(__APPLE__)
256
+ extern "C" int atexit(void (*)(void));
257
+
258
+ #elif defined(__GNUC__) && !defined(__ANDROID__) && !defined(__HORIZON__)
259
+ extern int atexit(void(*)(void)) throw();
260
+
261
+ #elif defined(__HORIZON__)
262
+
263
+ // __TEMP_WAR__ 200132570 HOS : Disable atexit call until it works
264
+ #define atexit(p)
265
+
266
+ #else /* __GNUC__ && !__ANDROID__ */
267
+ extern int __cdecl atexit(void(__cdecl *)(void));
268
+ #endif
269
+
270
+ }
271
+
272
+ static void **__cudaFatCubinHandle;
273
+
274
+ static void __cdecl __cudaUnregisterBinaryUtil(void)
275
+ {
276
+ ____nv_dummy_param_ref((void *)&__cudaFatCubinHandle);
277
+ __cudaUnregisterFatBinary(__cudaFatCubinHandle);
278
+ }
279
+
280
+ static char __nv_init_managed_rt_with_module(void **handle)
281
+ {
282
+ return __cudaInitModule(handle);
283
+ }
284
+
285
+ #include "common_functions.h"
286
+
287
+ #pragma pack()
288
+
289
+ #if defined(_WIN32)
290
+
291
+ #pragma warning(disable: 4099)
292
+
293
+ #if !defined(_WIN64)
294
+
295
+ #pragma warning(disable: 4408)
296
+
297
+ #endif /* !_WIN64 */
298
+
299
+ #endif /* _WIN32 */
300
+
301
+ #endif /* !__CUDA_INTERNAL_COMPILATION__ */
302
+
303
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__)
304
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
305
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__
306
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/math_functions.h ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/math_functions.hpp ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/mma.h ADDED
@@ -0,0 +1,754 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2020 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("crt/mma.h is an internal header file and must not be used directly. Please use mma.h instead.")
53
+ #else
54
+ #warning "crt/mma.h is an internal header file and must not be used directly. Please use mma.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__
58
+ #endif
59
+
60
+ #if !defined(__CUDA_MMA_H__)
61
+ #define __CUDA_MMA_H__
62
+
63
+ #include <cuda_fp16.h>
64
+ #include <cuda_bf16.h>
65
+
66
+ #define __CUDA_MMA_DEVICE_DECL__ static __device__ __inline__
67
+
68
+ #if defined(__cplusplus) && defined(__CUDACC__)
69
+
70
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
71
+
72
+
73
+ #if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
74
+ #define __DEF_IF_HOST { }
75
+ #else /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
76
+ #define __DEF_IF_HOST ;
77
+ #endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
78
+
79
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720
80
+ #define __CUDA_IMMA__ 1
81
+ #endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720 */
82
+
83
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730
84
+ #define __CUDA_SUBBYTE_IMMA__ 1
85
+ #endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730 */
86
+
87
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
88
+ #define __CUDA_AMPERE_MMA__ 1
89
+ #endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 */
90
+
91
+ namespace nvcuda {
92
+ namespace wmma {
93
+
94
+ // utility functions
95
+ #ifdef __CUDA_AMPERE_MMA__
96
+ inline __device__ float __float_to_tf32(float in)
97
+ {
98
+ float ret;
99
+ asm("{\n .reg .b32 __$1;"
100
+ "\n cvt.rna.tf32.f32 __$1, %1;"
101
+ "\n mov.b32 %0, __$1;\n}\n" : "=f"(ret) : "f"(in) );
102
+ return ret;
103
+ }
104
+ #endif /* __CUDA_AMPERE_MMA__ */
105
+
106
+ //
107
+ // tags
108
+ //
109
+ struct row_major;
110
+ struct col_major;
111
+ struct matrix_a;
112
+ struct matrix_b;
113
+ struct accumulator;
114
+
115
+ #ifdef __CUDA_AMPERE_MMA__
116
+ namespace precision {
117
+ struct tf32;
118
+ }
119
+ #endif /* __CUDA_AMPERE_MMA__ */
120
+ #ifdef __CUDA_SUBBYTE_IMMA__
121
+ namespace experimental {
122
+ namespace precision {
123
+ struct u4; // 4-bit unsigned
124
+ struct s4; // 4-bit signed
125
+ struct b1; // 1-bit
126
+ }
127
+ enum bmmaBitOp { bmmaBitOpXOR = 1
128
+ #ifdef __CUDA_AMPERE_MMA__
129
+ , bmmaBitOpAND = 2
130
+ #endif /* __CUDA_AMPERE_MMA__ */
131
+ };
132
+ enum bmmaAccumulateOp { bmmaAccumulateOpPOPC = 1 };
133
+ }
134
+ #endif /* __CUDA_SUBBYTE_IMMA__ */
135
+
136
+ //
137
+ // layout
138
+ //
139
+ enum layout_t {
140
+ mem_row_major, mem_col_major
141
+ };
142
+
143
+ template <typename T>
144
+ struct helper_traits {
145
+ typedef T element_type;
146
+ typedef T storage_element_type;
147
+ typedef T fill_argument_type;
148
+ };
149
+
150
+ #ifdef __CUDA_SUBBYTE_IMMA__
151
+ template<> struct helper_traits<experimental::precision::u4> {
152
+ typedef experimental::precision::u4 element_type;
153
+ typedef unsigned int storage_element_type;
154
+ typedef unsigned int fill_argument_type;
155
+ };
156
+
157
+ template<> struct helper_traits<experimental::precision::s4> {
158
+ typedef experimental::precision::s4 element_type;
159
+ typedef int storage_element_type;
160
+ typedef int fill_argument_type;
161
+ };
162
+
163
+ template<> struct helper_traits<experimental::precision::b1> {
164
+ typedef experimental::precision::b1 element_type;
165
+ typedef unsigned int storage_element_type;
166
+ typedef unsigned int fill_argument_type;
167
+ };
168
+ #endif /* __CUDA_SUBBYTE_IMMA__ */
169
+
170
+ #ifdef __CUDA_AMPERE_MMA__
171
+ template<> struct helper_traits<precision::tf32> {
172
+ typedef precision::tf32 element_type;
173
+ typedef float storage_element_type;
174
+ typedef float fill_argument_type;
175
+ };
176
+ #endif /* __CUDA_AMPERE_MMA__ */
177
+
178
+ //
179
+ // The base fragment type
180
+ //
181
+ /* note: alignment required for compiler implementation */
182
+ template <typename T, int size, int packed_size = size>
183
+ struct __align__(8) __frag_base {
184
+
185
+ /* Number of elements in the fragment */
186
+ enum {num_elements = size};
187
+
188
+ /* Number of storage elements in the fragment.
189
+
190
+ The elements of the fragment are packed together when the
191
+ fragment element type is experimental::precision::u4,
192
+ experimental::precision::s4 or experimental::precision::b1.
193
+ When elements are packed, num_storage_elements
194
+ will be smaller than num_elements.
195
+ */
196
+ enum {num_storage_elements = packed_size};
197
+
198
+ /* element type of the fragment */
199
+ typedef T element_type;
200
+
201
+ /* element type of the storage representation.
202
+
203
+ The mapping from element_type to storage_element_type is as follows:
204
+ experimental::precision::u4 -> unsigned (8 elements in 1 storage element)
205
+ experimental::precision::s4 -> int (8 elements in 1 storage element)
206
+ experimental::precision::b1 -> unsigned (32 elements in 1 storage element)
207
+ precision::tf32 -> float (1 element in 1 storage element)
208
+ all other types T -> T
209
+ */
210
+ typedef typename helper_traits<T>::storage_element_type storage_element_type;
211
+
212
+ /* Storage for the (possibly packed) fragment elements. */
213
+ storage_element_type x[num_storage_elements];
214
+ };
215
+
216
+ template <typename FragEleType, typename StorageType, typename ArgType>
217
+ static inline __device__ StorageType __get_storage_value(ArgType in) { return in; }
218
+
219
+ #ifdef __CUDA_SUBBYTE_IMMA__
220
+ template<>
221
+ __device__ inline unsigned
222
+ __get_storage_value<experimental::precision::u4, unsigned, unsigned>(unsigned in)
223
+ {
224
+ /* For experimental::precision::u4 fragment element type, pack 8 elements into a single
225
+ 32-bit unsigned int storage element */
226
+ unsigned val = in & 0xf;
227
+ return (val | (val << 4) | (val << 8) | (val << 12) | (val << 16) |
228
+ (val << 20) | (val << 24) | (val << 28));
229
+ };
230
+
231
+ template<>
232
+ __device__ inline int
233
+ __get_storage_value<experimental::precision::s4, int, int>(int in)
234
+ {
235
+ /* For experimental::precision::s4 fragment element type, pack 8 elements into a single
236
+ 32-bit signed int storage element */
237
+ int val = in & 0xf;
238
+ return (val | (val << 4) | (val << 8) | (val << 12) | (val << 16) |
239
+ (val << 20) | (val << 24) | (val << 28));
240
+ };
241
+
242
+ template<>
243
+ __device__ inline unsigned
244
+ __get_storage_value<experimental::precision::b1, unsigned, unsigned>(unsigned in)
245
+ {
246
+ /* For experimental::precision::b1 fragment element type, pack 32 elements into a
247
+ single 32-bit unsigned int storage element */
248
+ return (in & 0x1) ? 0xFFFFFFFFU : 0;
249
+ }
250
+ #endif /* __CUDA_SUBBYTE_IMMA__ */
251
+
252
+ template <typename FragEleType, int size, int packed_size>
253
+ __CUDA_MMA_DEVICE_DECL__ void fill_fragment(__frag_base<FragEleType, size, packed_size>& f,
254
+ /* The mapping from fragment element type (FragEleType) to fill_argument_type is:
255
+ experimental::precision::u4 -> unsigned (only lower 4 bits taken)
256
+ experimental::precision::s4 -> int (only lower 4 bits taken)
257
+ experimental::precision::b1 -> unsigned (only lowest 1 bit taken)
258
+ precision::tf32 -> float
259
+ all other types T -> T
260
+ */
261
+ const typename helper_traits<FragEleType>::fill_argument_type & in) {
262
+
263
+ /* get the (possibly packed) storage element value. See the specializations above for fragment
264
+ element types where the storage representation is packed */
265
+ typedef typename helper_traits<FragEleType>::storage_element_type storage_type;
266
+ storage_type v = __get_storage_value<FragEleType, storage_type>(in);
267
+ #pragma unroll
268
+ for (int i=0; i< f.num_storage_elements; i++)
269
+ f.x[i] = v;
270
+ }
271
+
272
+ //
273
+ // Fragment template
274
+ //
275
+ template<typename Use, int m, int n, int k, typename T, typename Layout=void> class fragment;
276
+
277
+ //
278
+ // Fragments for 16x16x16
279
+ //
280
+ template<> class fragment<matrix_a, 16, 16, 16, __half, row_major> : public __frag_base<__half, 16> {};
281
+ template<> class fragment<matrix_a, 16, 16, 16, __half, col_major> : public __frag_base<__half, 16> {};
282
+ template<> class fragment<matrix_b, 16, 16, 16, __half, row_major> : public __frag_base<__half, 16> {};
283
+ template<> class fragment<matrix_b, 16, 16, 16, __half, col_major> : public __frag_base<__half, 16> {};
284
+ template<> class fragment<accumulator, 16, 16, 16, __half> : public __frag_base<__half, 8> {};
285
+ template<> class fragment<accumulator, 16, 16, 16, float> : public __frag_base<float, 8> {};
286
+
287
+ #ifdef __CUDA_IMMA__
288
+ template<> class fragment<matrix_a, 16, 16, 16, signed char, row_major> : public __frag_base<signed char, 8> {};
289
+ template<> class fragment<matrix_a, 16, 16, 16, signed char, col_major> : public __frag_base<signed char, 8> {};
290
+ template<> class fragment<matrix_a, 16, 16, 16, unsigned char, row_major> : public __frag_base<unsigned char, 8> {};
291
+ template<> class fragment<matrix_a, 16, 16, 16, unsigned char, col_major> : public __frag_base<unsigned char, 8> {};
292
+ template<> class fragment<matrix_b, 16, 16, 16, signed char, row_major> : public __frag_base<signed char, 8> {};
293
+ template<> class fragment<matrix_b, 16, 16, 16, signed char, col_major> : public __frag_base<signed char, 8> {};
294
+ template<> class fragment<matrix_b, 16, 16, 16, unsigned char, row_major> : public __frag_base<unsigned char, 8> {};
295
+ template<> class fragment<matrix_b, 16, 16, 16, unsigned char, col_major> : public __frag_base<unsigned char, 8> {};
296
+ template<> class fragment<accumulator, 16, 16, 16, int> : public __frag_base<int, 8> {};
297
+ #endif /* __CUDA_IMMA__ */
298
+
299
+ #ifdef __CUDA_AMPERE_MMA__
300
+ template<> class fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 8> {};
301
+ template<> class fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 8> {};
302
+ template<> class fragment<matrix_b, 16, 16, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 8> {};
303
+ template<> class fragment<matrix_b, 16, 16, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 8> {};
304
+ #endif /* __CUDA_AMPERE_MMA__ */
305
+
306
+ //
307
+ // Fragments for 32x8x16
308
+ //
309
+ template<> class fragment<matrix_a, 32, 8, 16, __half, row_major> : public __frag_base<__half, 16> {};
310
+ template<> class fragment<matrix_a, 32, 8, 16, __half, col_major> : public __frag_base<__half, 16> {};
311
+ template<> class fragment<matrix_b, 32, 8, 16, __half, row_major> : public __frag_base<__half, 16> {};
312
+ template<> class fragment<matrix_b, 32, 8, 16, __half, col_major> : public __frag_base<__half, 16> {};
313
+ template<> class fragment<accumulator, 32, 8, 16, __half> : public __frag_base<__half, 8> {};
314
+ template<> class fragment<accumulator, 32, 8, 16, float> : public __frag_base<float, 8> {};
315
+
316
+ #ifdef __CUDA_IMMA__
317
+ template<> class fragment<matrix_a, 32, 8, 16, signed char, row_major> : public __frag_base<signed char, 16> {};
318
+ template<> class fragment<matrix_a, 32, 8, 16, signed char, col_major> : public __frag_base<signed char, 16> {};
319
+ template<> class fragment<matrix_a, 32, 8, 16, unsigned char, row_major> : public __frag_base<unsigned char, 16> {};
320
+ template<> class fragment<matrix_a, 32, 8, 16, unsigned char, col_major> : public __frag_base<unsigned char, 16> {};
321
+ template<> class fragment<matrix_b, 32, 8, 16, signed char, row_major> : public __frag_base<signed char, 4> {};
322
+ template<> class fragment<matrix_b, 32, 8, 16, signed char, col_major> : public __frag_base<signed char, 4> {};
323
+ template<> class fragment<matrix_b, 32, 8, 16, unsigned char, row_major> : public __frag_base<unsigned char, 4> {};
324
+ template<> class fragment<matrix_b, 32, 8, 16, unsigned char, col_major> : public __frag_base<unsigned char, 4> {};
325
+ template<> class fragment<accumulator, 32, 8, 16, int> : public __frag_base<int, 8> {};
326
+ #endif /* __CUDA_IMMA__ */
327
+
328
+ #ifdef __CUDA_AMPERE_MMA__
329
+ template<> class fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 16> {};
330
+ template<> class fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 16> {};
331
+ template<> class fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 4> {};
332
+ template<> class fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 4> {};
333
+ #endif /* __CUDA_AMPERE_MMA__ */
334
+
335
+ //
336
+ // Fragments for 8x32x16
337
+ //
338
+ template<> class fragment<matrix_a, 8, 32, 16, __half, row_major> : public __frag_base<__half, 16> {};
339
+ template<> class fragment<matrix_a, 8, 32, 16, __half, col_major> : public __frag_base<__half, 16> {};
340
+ template<> class fragment<matrix_b, 8, 32, 16, __half, row_major> : public __frag_base<__half, 16> {};
341
+ template<> class fragment<matrix_b, 8, 32, 16, __half, col_major> : public __frag_base<__half, 16> {};
342
+ template<> class fragment<accumulator, 8, 32, 16, __half> : public __frag_base<__half, 8> {};
343
+ template<> class fragment<accumulator, 8, 32, 16, float> : public __frag_base<float, 8> {};
344
+
345
+ #ifdef __CUDA_IMMA__
346
+ template<> class fragment<matrix_a, 8, 32, 16, signed char, row_major> : public __frag_base<signed char, 4> {};
347
+ template<> class fragment<matrix_a, 8, 32, 16, signed char, col_major> : public __frag_base<signed char, 4> {};
348
+ template<> class fragment<matrix_a, 8, 32, 16, unsigned char, row_major> : public __frag_base<unsigned char, 4> {};
349
+ template<> class fragment<matrix_a, 8, 32, 16, unsigned char, col_major> : public __frag_base<unsigned char, 4> {};
350
+ template<> class fragment<matrix_b, 8, 32, 16, signed char, row_major> : public __frag_base<signed char, 16> {};
351
+ template<> class fragment<matrix_b, 8, 32, 16, signed char, col_major> : public __frag_base<signed char, 16> {};
352
+ template<> class fragment<matrix_b, 8, 32, 16, unsigned char, row_major> : public __frag_base<unsigned char, 16> {};
353
+ template<> class fragment<matrix_b, 8, 32, 16, unsigned char, col_major> : public __frag_base<unsigned char, 16> {};
354
+ template<> class fragment<accumulator, 8, 32, 16, int> : public __frag_base<int, 8> {};
355
+ #endif /* __CUDA_IMMA__ */
356
+
357
+ #ifdef __CUDA_AMPERE_MMA__
358
+ template<> class fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 4> {};
359
+ template<> class fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 4> {};
360
+ template<> class fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major> : public __frag_base<__nv_bfloat16, 16> {};
361
+ template<> class fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major> : public __frag_base<__nv_bfloat16, 16> {};
362
+ #endif /* __CUDA_AMPERE_MMA__ */
363
+
364
+ #ifdef __CUDA_SUBBYTE_IMMA__
365
+ //
366
+ // Fragments for 8x8x32
367
+ //
368
+ template<> class fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major> : public __frag_base<experimental::precision::u4, 8, 1> {};
369
+ template<> class fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major> : public __frag_base<experimental::precision::s4, 8, 1> {};
370
+ template<> class fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major> : public __frag_base<experimental::precision::u4, 8, 1> {};
371
+ template<> class fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major> : public __frag_base<experimental::precision::s4, 8, 1> {};
372
+ template<> class fragment<accumulator, 8, 8, 32, int> : public __frag_base<int, 2> {};
373
+
374
+ //
375
+ // Fragments for 8x8x128
376
+ //
377
+ template<> class fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major> : public __frag_base<experimental::precision::b1, 32, 1> {};
378
+ template<> class fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major> : public __frag_base<experimental::precision::b1, 32, 1> {};
379
+ template<> class fragment<accumulator, 8, 8, 128, int> : public __frag_base<int, 2> {};
380
+ #endif /* __CUDA_SUBBYTE_IMMA__ */
381
+
382
+ #ifdef __CUDA_AMPERE_MMA__
383
+ //
384
+ // Fragments for 16x16x8
385
+ //
386
+ template<> class fragment<matrix_a, 16, 16, 8, precision::tf32, row_major> : public __frag_base<precision::tf32, 4> {};
387
+ template<> class fragment<matrix_a, 16, 16, 8, precision::tf32, col_major> : public __frag_base<precision::tf32, 4> {};
388
+ template<> class fragment<matrix_b, 16, 16, 8, precision::tf32, row_major> : public __frag_base<precision::tf32, 4> {};
389
+ template<> class fragment<matrix_b, 16, 16, 8, precision::tf32, col_major> : public __frag_base<precision::tf32, 4> {};
390
+ template<> class fragment<accumulator, 16, 16, 8, float> : public __frag_base<float, 8> {};
391
+
392
+ //
393
+ // Fragments for 8x8x4
394
+ //
395
+ template<> class fragment<matrix_a, 8, 8, 4, double, row_major> : public __frag_base<double, 1> {};
396
+ template<> class fragment<matrix_a, 8, 8, 4, double, col_major> : public __frag_base<double, 1> {};
397
+ template<> class fragment<matrix_b, 8, 8, 4, double, row_major> : public __frag_base<double, 1> {};
398
+ template<> class fragment<matrix_b, 8, 8, 4, double, col_major> : public __frag_base<double, 1> {};
399
+ template<> class fragment<accumulator, 8, 8, 4, double> : public __frag_base<double, 2> {};
400
+ #endif /* __CUDA_AMPERE_MMA__ */
401
+
402
+
403
+ //
404
+ // Load functions for frags of shape m16n16k16
405
+ //
406
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
407
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
408
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
409
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
410
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
411
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
412
+
413
+ #ifdef __CUDA_IMMA__
414
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
415
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
416
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
417
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
418
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
419
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
420
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
421
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
422
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
423
+ #endif /* __CUDA_IMMA__ */
424
+
425
+ #ifdef __CUDA_AMPERE_MMA__
426
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
427
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
428
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
429
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
430
+ #endif /* __CUDA_AMPERE_MMA__ */
431
+
432
+ //
433
+ // Load functions for frags of shape m32n8k16
434
+ //
435
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
436
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
437
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
438
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
439
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
440
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
441
+
442
+ #ifdef __CUDA_IMMA__
443
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
444
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
445
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
446
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
447
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
448
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
449
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
450
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
451
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
452
+ #endif /* __CUDA_IMMA__ */
453
+
454
+ #ifdef __CUDA_AMPERE_MMA__
455
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
456
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
457
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
458
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
459
+ #endif /* __CUDA_AMPERE_MMA__ */
460
+
461
+ //
462
+ // Load functions for frags of shape m8n32k16
463
+ //
464
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
465
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
466
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
467
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) __DEF_IF_HOST
468
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
469
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
470
+
471
+ #ifdef __CUDA_IMMA__
472
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
473
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
474
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
475
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
476
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
477
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) __DEF_IF_HOST
478
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
479
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) __DEF_IF_HOST
480
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
481
+ #endif /* __CUDA_IMMA__ */
482
+
483
+ #ifdef __CUDA_AMPERE_MMA__
484
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
485
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
486
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
487
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) __DEF_IF_HOST
488
+ #endif /* __CUDA_AMPERE_MMA__ */
489
+
490
+ #ifdef __CUDA_SUBBYTE_IMMA__
491
+ //
492
+ // Load functions for frags of shape m8n8k32
493
+ //
494
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
495
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
496
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
497
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
498
+
499
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 32, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
500
+
501
+ //
502
+ // Load functions for frags of shape m8n8k128
503
+ //
504
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
505
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& a, const void* p, unsigned ldm) __DEF_IF_HOST
506
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 128, int>& a, const int* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
507
+
508
+ #endif /* __CUDA_SUBBYTE_IMMA__ */
509
+
510
+
511
+ #ifdef __CUDA_AMPERE_MMA__
512
+ //
513
+ // Load functions for frags of shape m16n16k8
514
+ //
515
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
516
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
517
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
518
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) __DEF_IF_HOST
519
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 8, float>& a, const float* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
520
+
521
+ //
522
+ // Load functions for frags of shape m8n8k4
523
+ //
524
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
525
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
526
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
527
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) __DEF_IF_HOST
528
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 4, double>& a, const double* p, unsigned ldm, layout_t layout) __DEF_IF_HOST
529
+ #endif /* __CUDA_AMPERE_MMA__ */
530
+
531
+ //
532
+ // Store functions for frags of shape m16n16k16
533
+ //
534
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 16, 16, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
535
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 16, 16, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
536
+ #ifdef __CUDA_IMMA__
537
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 16, 16, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
538
+ #endif /* __CUDA_IMMA__ */
539
+
540
+ //
541
+ // Store functions for frags of shape m32n8k16
542
+ //
543
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 32, 8, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
544
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 32, 8, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
545
+ #ifdef __CUDA_IMMA__
546
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 32, 8, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
547
+ #endif /* __CUDA_IMMA__ */
548
+
549
+ //
550
+ // Store functions for frags of shape m8n32k16
551
+ //
552
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 8, 32, 16, __half>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
553
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 8, 32, 16, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
554
+ #ifdef __CUDA_IMMA__
555
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 32, 16, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
556
+ #endif /* __CUDA_IMMA__ */
557
+
558
+ #ifdef __CUDA_SUBBYTE_IMMA__
559
+ //
560
+ // Store functions for frags of shape m8n8k32
561
+ //
562
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 32, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
563
+
564
+ //
565
+ // Store functions for frags of shape m8n8k128
566
+ //
567
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 128, int>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
568
+
569
+ #endif /* __CUDA_SUBBYTE_IMMA__ */
570
+
571
+ #ifdef __CUDA_AMPERE_MMA__
572
+ //
573
+ // Store functions for frags of shape m16n16k8
574
+ //
575
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 16, 16, 8, float>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
576
+
577
+ //
578
+ // Store functions for frags of shape m8n8k4
579
+ //
580
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(double *p, const fragment<accumulator, 8, 8, 4, double>& a, unsigned ldm, layout_t layout) __DEF_IF_HOST
581
+ #endif /* __CUDA_AMPERE_MMA__ */
582
+
583
+ //
584
+ // MMA functions for shape m16n16k16
585
+ //
586
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
587
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
588
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
589
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
590
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
591
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
592
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
593
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) __DEF_IF_HOST
594
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
595
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
596
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
597
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
598
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
599
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
600
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
601
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
602
+
603
+ #ifdef __CUDA_IMMA__
604
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
605
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
606
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
607
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
608
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
609
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
610
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
611
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf=false) __DEF_IF_HOST
612
+ #endif /* __CUDA_IMMA__ */
613
+
614
+ #ifdef __CUDA_AMPERE_MMA__
615
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
616
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
617
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
618
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) __DEF_IF_HOST
619
+ #endif /* __CUDA_AMPERE_MMA__ */
620
+
621
+ //
622
+ // MMA functions for shape m32n8k16
623
+ //
624
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
625
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
626
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
627
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
628
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
629
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
630
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
631
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, __half>& c) __DEF_IF_HOST
632
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
633
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
634
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
635
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
636
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
637
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
638
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
639
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b, 32, 8, 16, __half, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
640
+
641
+ #ifdef __CUDA_IMMA__
642
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
643
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
644
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
645
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
646
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
647
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
648
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
649
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf=false) __DEF_IF_HOST
650
+ #endif /* __CUDA_IMMA__ */
651
+
652
+ #ifdef __CUDA_AMPERE_MMA__
653
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
654
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
655
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
656
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) __DEF_IF_HOST
657
+ #endif /* __CUDA_AMPERE_MMA__ */
658
+
659
+ //
660
+ // MMA functions for shape m8n32k16
661
+ //
662
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
663
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
664
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
665
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
666
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
667
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
668
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
669
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, __half>& c) __DEF_IF_HOST
670
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
671
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
672
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
673
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
674
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
675
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
676
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
677
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b, 8, 32, 16, __half, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
678
+
679
+ #ifdef __CUDA_IMMA__
680
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
681
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
682
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
683
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
684
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
685
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
686
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
687
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf=false) __DEF_IF_HOST
688
+ #endif /* __CUDA_IMMA__ */
689
+
690
+ #ifdef __CUDA_AMPERE_MMA__
691
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
692
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
693
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
694
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) __DEF_IF_HOST
695
+ #endif /* __CUDA_AMPERE_MMA__ */
696
+
697
+ #ifdef __CUDA_SUBBYTE_IMMA__
698
+ //
699
+ // MMA functions for shape m8n8k32
700
+ //
701
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf=false) __DEF_IF_HOST
702
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf=false) __DEF_IF_HOST
703
+
704
+
705
+ //
706
+ // MMA functions for shape m8n8k128
707
+ //
708
+ __CUDA_MMA_DEVICE_DECL__ void bmma_sync(fragment<accumulator, 8, 8, 128, int>& d, const fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& b, const fragment<accumulator, 8, 8, 128, int>& c,
709
+ experimental::bmmaBitOp = experimental::bmmaBitOpXOR,
710
+ experimental::bmmaAccumulateOp = experimental::bmmaAccumulateOpPOPC) __DEF_IF_HOST
711
+
712
+ #endif /* __CUDA_SUBBYTE_IMMA__ */
713
+
714
+ #ifdef __CUDA_AMPERE_MMA__
715
+ //
716
+ // MMA functions for shape m16n16k8
717
+ //
718
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
719
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
720
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
721
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) __DEF_IF_HOST
722
+
723
+ //
724
+ // MMA functions for shape m8n8k4
725
+ //
726
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
727
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
728
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
729
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) __DEF_IF_HOST
730
+ #endif /* __CUDA_AMPERE_MMA__ */
731
+ };
732
+ };
733
+
734
+ #undef __DEF_IF_HOST
735
+ #undef __CUDA_IMMA__
736
+ #undef __CUDA_SUBBYTE_IMMA__
737
+ #undef __CUDA_AMPERE_MMA__
738
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
739
+
740
+ #endif /* __cplusplus && __CUDACC__ */
741
+
742
+ #undef __CUDA_MMA_DEVICE_DECL__
743
+
744
+ #if defined(__CUDA_ARCH__)
745
+ #include "mma.hpp"
746
+ #endif /* defined(__CUDA_ARCH__) */
747
+
748
+
749
+ #endif /* !__CUDA_MMA_H__ */
750
+
751
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__)
752
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
753
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H__
754
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/mma.hpp ADDED
@@ -0,0 +1,1128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2020 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("crt/mma.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "crt/mma.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_HPP__
58
+ #endif
59
+
60
+ #if !defined(__CUDA_MMA_HPP__)
61
+ #define __CUDA_MMA_HPP__
62
+
63
+ #if defined(__cplusplus) && defined(__CUDACC__)
64
+
65
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
66
+
67
+ #include <cuda_fp16.h>
68
+ #include <cuda_bf16.h>
69
+
70
+ #define __CUDA_MMA_DEVICE_DECL__ static __device__ __inline__
71
+
72
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720
73
+ #define __CUDA_IMMA__ 1
74
+ #endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 720 */
75
+
76
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730
77
+ #define __CUDA_SUBBYTE_IMMA__ 1
78
+ #endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 730 */
79
+
80
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
81
+ #define __CUDA_AMPERE_MMA__ 1
82
+ #endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 */
83
+
84
+ namespace nvcuda {
85
+ namespace wmma {
86
+
87
+ //
88
+ // Load functions for frags of shape m16n16k16
89
+ //
90
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
91
+ __hmma_m16n16k16_ld_a((int*)&a, (const int*)p, ldm, 0);
92
+ }
93
+
94
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
95
+ __hmma_m16n16k16_ld_a((int*)&a, (const int*)p, ldm, 1);
96
+ }
97
+
98
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b,16, 16, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
99
+ __hmma_m16n16k16_ld_b((int*)&a, (const int*)p, ldm, 0);
100
+ }
101
+
102
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b,16, 16, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
103
+ __hmma_m16n16k16_ld_b((int*)&a, (const int*)p, ldm, 1);
104
+ }
105
+
106
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator,16, 16, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) {
107
+ if (layout == mem_row_major)
108
+ __hmma_m16n16k16_ld_c_f16((int*)&a, (const int*)p, ldm, 0);
109
+ else
110
+ __hmma_m16n16k16_ld_c_f16((int*)&a, (const int*)p, ldm, 1);
111
+ }
112
+
113
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator,16, 16, 16, float>& a, const float* p, unsigned ldm, layout_t layout) {
114
+ if (layout == mem_row_major)
115
+ __hmma_m16n16k16_ld_c_f32((float*)&a, (const float*)p, ldm, 0);
116
+ else
117
+ __hmma_m16n16k16_ld_c_f32((float*)&a, (const float*)p, ldm, 1);
118
+ }
119
+
120
+ #ifdef __CUDA_IMMA__
121
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
122
+ __imma_m16n16k16_ld_a_s8((int *)&a, (const int *)p, ldm, 0);
123
+ }
124
+
125
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
126
+ __imma_m16n16k16_ld_a_s8((int *)&a, (const int *)p, ldm, 1);
127
+ }
128
+
129
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
130
+ __imma_m16n16k16_ld_a_u8((int *)&a, (const int *)p, ldm, 0);
131
+ }
132
+
133
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
134
+ __imma_m16n16k16_ld_a_u8((int *)&a, (const int *)p, ldm, 1);
135
+ }
136
+
137
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
138
+ __imma_m16n16k16_ld_b_s8((int *)&a, (const int *)p, ldm, 0);
139
+ }
140
+
141
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
142
+ __imma_m16n16k16_ld_b_s8((int *)&a, (const int *)p, ldm, 1);
143
+ }
144
+
145
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
146
+ __imma_m16n16k16_ld_b_u8((int *)&a, (const int *)p, ldm, 0);
147
+ }
148
+
149
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
150
+ __imma_m16n16k16_ld_b_u8((int *)&a, (const int *)p, ldm, 1);
151
+ }
152
+
153
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator,16, 16, 16, int>& a, const int* p, unsigned ldm, layout_t layout) {
154
+ if (layout == mem_row_major)
155
+ __imma_m16n16k16_ld_c((int *)&a, (const int*)p, ldm, 0);
156
+ else
157
+ __imma_m16n16k16_ld_c((int *)&a, (const int*)p, ldm, 1);
158
+ }
159
+ #endif /* __CUDA_IMMA__ */
160
+
161
+ #ifdef __CUDA_AMPERE_MMA__
162
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
163
+ __mma_bf16_m16n16k16_ld_a((int*)&a, (const int*)p, ldm, 0);
164
+ }
165
+
166
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
167
+ __mma_bf16_m16n16k16_ld_a((int*)&a, (const int*)p, ldm, 1);
168
+ }
169
+
170
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
171
+ __mma_bf16_m16n16k16_ld_b((int*)&a, (const int*)p, ldm, 0);
172
+ }
173
+
174
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
175
+ __mma_bf16_m16n16k16_ld_b((int*)&a, (const int*)p, ldm, 1);
176
+ }
177
+ #endif /* __CUDA_AMPERE_MMA__ */
178
+
179
+
180
+ //
181
+ // Load functions for frags of shape m32n8k16
182
+ //
183
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
184
+ __hmma_m32n8k16_ld_a((int*)&a, (const int*)p, ldm, 0);
185
+ }
186
+
187
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
188
+ __hmma_m32n8k16_ld_a((int*)&a, (const int*)p, ldm, 1);
189
+ }
190
+
191
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
192
+ __hmma_m32n8k16_ld_b((int*)&a, (const int*)p, ldm, 0);
193
+ }
194
+
195
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
196
+ __hmma_m32n8k16_ld_b((int*)&a, (const int*)p, ldm, 1);
197
+ }
198
+
199
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) {
200
+ if (layout == mem_row_major)
201
+ __hmma_m32n8k16_ld_c_f16((int*)&a, (const int*)p, ldm, 0);
202
+ else
203
+ __hmma_m32n8k16_ld_c_f16((int*)&a, (const int*)p, ldm, 1);
204
+ }
205
+
206
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, float>& a, const float* p, unsigned ldm, layout_t layout) {
207
+ if (layout == mem_row_major)
208
+ __hmma_m32n8k16_ld_c_f32((float*)&a, (const float*)p, ldm, 0);
209
+ else
210
+ __hmma_m32n8k16_ld_c_f32((float*)&a, (const float*)p, ldm, 1);
211
+ }
212
+
213
+ #ifdef __CUDA_IMMA__
214
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
215
+ __imma_m32n8k16_ld_a_s8((int *)&a, (const int *)p, ldm, 0);
216
+ }
217
+
218
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
219
+ __imma_m32n8k16_ld_a_s8((int *)&a, (const int *)p, ldm, 1);
220
+ }
221
+
222
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
223
+ __imma_m32n8k16_ld_a_u8((int *)&a, (const int *)p, ldm, 0);
224
+ }
225
+
226
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
227
+ __imma_m32n8k16_ld_a_u8((int *)&a, (const int *)p, ldm, 1);
228
+ }
229
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
230
+ __imma_m32n8k16_ld_b_s8((int *)&a, (const int *)p, ldm, 0);
231
+ }
232
+
233
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
234
+ __imma_m32n8k16_ld_b_s8((int *)&a, (const int *)p, ldm, 1);
235
+ }
236
+
237
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
238
+ __imma_m32n8k16_ld_b_u8((int *)&a, (const int *)p, ldm, 0);
239
+ }
240
+
241
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
242
+ __imma_m32n8k16_ld_b_u8((int *)&a, (const int *)p, ldm, 1);
243
+ }
244
+
245
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 32, 8, 16, int>& a, const int* p, unsigned ldm, layout_t layout) {
246
+ if (layout == mem_row_major)
247
+ __imma_m32n8k16_ld_c((int *)&a, (const int*)p, ldm, 0);
248
+ else
249
+ __imma_m32n8k16_ld_c((int *)&a, (const int*)p, ldm, 1);
250
+ }
251
+ #endif /* __CUDA_IMMA__ */
252
+
253
+ #ifdef __CUDA_AMPERE_MMA__
254
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
255
+ __mma_bf16_m32n8k16_ld_a((int*)&a, (const int*)p, ldm, 0);
256
+ }
257
+
258
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
259
+ __mma_bf16_m32n8k16_ld_a((int*)&a, (const int*)p, ldm, 1);
260
+ }
261
+
262
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
263
+ __mma_bf16_m32n8k16_ld_b((int*)&a, (const int*)p, ldm, 0);
264
+ }
265
+
266
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
267
+ __mma_bf16_m32n8k16_ld_b((int*)&a, (const int*)p, ldm, 1);
268
+ }
269
+ #endif /* __CUDA_AMPERE_MMA__ */
270
+
271
+
272
+ //
273
+ // Load functions for frags of shape m8n32k16
274
+ //
275
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
276
+ __hmma_m8n32k16_ld_a((int*)&a, (const int*)p, ldm, 0);
277
+ }
278
+
279
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
280
+ __hmma_m8n32k16_ld_a((int*)&a, (const int*)p, ldm, 1);
281
+ }
282
+
283
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, row_major>& a, const __half* p, unsigned ldm) {
284
+ __hmma_m8n32k16_ld_b((int*)&a, (const int*)p, ldm, 0);
285
+ }
286
+
287
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __half, col_major>& a, const __half* p, unsigned ldm) {
288
+ __hmma_m8n32k16_ld_b((int*)&a, (const int*)p, ldm, 1);
289
+ }
290
+
291
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, __half>& a, const __half* p, unsigned ldm, layout_t layout) {
292
+ if (layout == mem_row_major)
293
+ __hmma_m8n32k16_ld_c_f16((int*)&a, (const int*)p, ldm, 0);
294
+ else
295
+ __hmma_m8n32k16_ld_c_f16((int*)&a, (const int*)p, ldm, 1);
296
+ }
297
+
298
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, float>& a, const float* p, unsigned ldm, layout_t layout) {
299
+ if (layout == mem_row_major)
300
+ __hmma_m8n32k16_ld_c_f32((float*)&a, (const float*)p, ldm, 0);
301
+ else
302
+ __hmma_m8n32k16_ld_c_f32((float*)&a, (const float*)p, ldm, 1);
303
+ }
304
+
305
+ #ifdef __CUDA_IMMA__
306
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
307
+ __imma_m8n32k16_ld_a_s8((int *)&a, (const int *)p, ldm, 0);
308
+ }
309
+
310
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
311
+ __imma_m8n32k16_ld_a_s8((int *)&a, (const int *)p, ldm, 1);
312
+ }
313
+
314
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
315
+ __imma_m8n32k16_ld_a_u8((int *)&a, (const int *)p, ldm, 0);
316
+ }
317
+
318
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
319
+ __imma_m8n32k16_ld_a_u8((int *)&a, (const int *)p, ldm, 1);
320
+ }
321
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, row_major>& a, const signed char* p, unsigned ldm) {
322
+ __imma_m8n32k16_ld_b_s8((int *)&a, (const int *)p, ldm, 0);
323
+ }
324
+
325
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, signed char, col_major>& a, const signed char* p, unsigned ldm) {
326
+ __imma_m8n32k16_ld_b_s8((int *)&a, (const int *)p, ldm, 1);
327
+ }
328
+
329
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& a, const unsigned char* p, unsigned ldm) {
330
+ __imma_m8n32k16_ld_b_u8((int *)&a, (const int *)p, ldm, 0);
331
+ }
332
+
333
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& a, const unsigned char* p, unsigned ldm) {
334
+ __imma_m8n32k16_ld_b_u8((int *)&a, (const int *)p, ldm, 1);
335
+ }
336
+
337
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 32, 16, int>& a, const int* p, unsigned ldm, layout_t layout) {
338
+ if (layout == mem_row_major)
339
+ __imma_m8n32k16_ld_c((int *)&a, (const int*)p, ldm, 0);
340
+ else
341
+ __imma_m8n32k16_ld_c((int *)&a, (const int*)p, ldm, 1);
342
+ }
343
+ #endif /* __CUDA_IMMA__ */
344
+
345
+ #ifdef __CUDA_AMPERE_MMA__
346
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
347
+ __mma_bf16_m8n32k16_ld_a((int*)&a, (const int*)p, ldm, 0);
348
+ }
349
+
350
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
351
+ __mma_bf16_m8n32k16_ld_a((int*)&a, (const int*)p, ldm, 1);
352
+ }
353
+
354
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& a, const __nv_bfloat16* p, unsigned ldm) {
355
+ __mma_bf16_m8n32k16_ld_b((int*)&a, (const int*)p, ldm, 0);
356
+ }
357
+
358
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& a, const __nv_bfloat16* p, unsigned ldm) {
359
+ __mma_bf16_m8n32k16_ld_b((int*)&a, (const int*)p, ldm, 1);
360
+ }
361
+ #endif /* __CUDA_AMPERE_MMA__ */
362
+
363
+
364
+ #ifdef __CUDA_SUBBYTE_IMMA__
365
+ //
366
+ // Load functions for frags of shape m8n8k32
367
+ //
368
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const void* p, unsigned ldm) {
369
+ __imma_m8n8k32_ld_a_s4((int *)&a, (const int *)p, ldm, 0);
370
+ }
371
+
372
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const void* p, unsigned ldm) {
373
+ __imma_m8n8k32_ld_a_u4((int *)&a, (const int *)p, ldm, 0);
374
+ }
375
+
376
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& a, const void* p, unsigned ldm) {
377
+ __imma_m8n8k32_ld_b_s4((int *)&a, (const int *)p, ldm, 1);
378
+ }
379
+
380
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& a, const void* p, unsigned ldm) {
381
+ __imma_m8n8k32_ld_b_u4((int *)&a, (const int *)p, ldm, 1);
382
+ }
383
+
384
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 32, int>& a, const int* p, unsigned ldm, layout_t layout) {
385
+ if (layout == mem_row_major)
386
+ __imma_m8n8k32_ld_c((int *)&a, (const int*)p, ldm, 0);
387
+ else
388
+ __imma_m8n8k32_ld_c((int *)&a, (const int*)p, ldm, 1);
389
+ }
390
+
391
+ //
392
+ // Load functions for frags of shape m8n8k128
393
+ //
394
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const void* p, unsigned ldm) {
395
+ __bmma_m8n8k128_ld_a_b1((int *)&a, (const int *)p, ldm, 0);
396
+ }
397
+
398
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& a, const void* p, unsigned ldm) {
399
+ __bmma_m8n8k128_ld_b_b1((int *)&a, (const int *)p, ldm, 1);
400
+ }
401
+
402
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 128, int>& a, const int* p, unsigned ldm, layout_t layout) {
403
+ if (layout == mem_row_major)
404
+ __bmma_m8n8k128_ld_c((int *)&a, (const int*)p, ldm, 0);
405
+ else
406
+ __bmma_m8n8k128_ld_c((int *)&a, (const int*)p, ldm, 1);
407
+ }
408
+ #endif /* __CUDA_SUBBYTE_IMMA__ */
409
+
410
+
411
+
412
+ #ifdef __CUDA_AMPERE_MMA__
413
+ // load functions for frags of shape m16n16k8
414
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) {
415
+ __mma_tf32_m16n16k8_ld_a((int *)&a, (const int *)p, ldm, 0);
416
+ }
417
+
418
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) {
419
+ __mma_tf32_m16n16k8_ld_a((int *)&a, (const int *)p, ldm, 1);
420
+ }
421
+
422
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& a, const float* p, unsigned ldm) {
423
+ __mma_tf32_m16n16k8_ld_b((int *)&a, (const int *)p, ldm, 0);
424
+ }
425
+
426
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& a, const float* p, unsigned ldm) {
427
+ __mma_tf32_m16n16k8_ld_b((int *)&a, (const int *)p, ldm, 1);
428
+ }
429
+
430
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 16, 16, 8, float>& a, const float* p, unsigned ldm, layout_t layout) {
431
+ if (layout == mem_row_major)
432
+ __mma_tf32_m16n16k8_ld_c((float *)&a, p, ldm, 0);
433
+ else
434
+ __mma_tf32_m16n16k8_ld_c((float *)&a, p, ldm, 1);
435
+ }
436
+
437
+ // load functions for frags of shape m8n8k4
438
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) {
439
+ __dmma_m8n8k4_ld_a((double *)&a, p, ldm, 0);
440
+ }
441
+
442
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_a, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) {
443
+ __dmma_m8n8k4_ld_a((double *)&a, p, ldm, 1);
444
+ }
445
+
446
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, row_major>& a, const double* p, unsigned ldm) {
447
+ __dmma_m8n8k4_ld_b((double *)&a, p, ldm, 0);
448
+ }
449
+
450
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<matrix_b, 8, 8, 4, double, col_major>& a, const double* p, unsigned ldm) {
451
+ __dmma_m8n8k4_ld_b((double *)&a, p, ldm, 1);
452
+ }
453
+
454
+ __CUDA_MMA_DEVICE_DECL__ void load_matrix_sync(fragment<accumulator, 8, 8, 4, double>& a, const double* p, unsigned ldm, layout_t layout) {
455
+ if (layout == mem_row_major)
456
+ __dmma_m8n8k4_ld_c((double *)&a, p, ldm, 0);
457
+ else
458
+ __dmma_m8n8k4_ld_c((double *)&a, p, ldm, 1);
459
+ }
460
+ #endif /* __CUDA_AMPERE_MMA__ */
461
+
462
+ //
463
+ // Store functions for frags of shape m16n16k16
464
+ //
465
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator,16, 16, 16, __half>& a, unsigned ldm, layout_t layout) {
466
+ if (layout == mem_row_major)
467
+ __hmma_m16n16k16_st_c_f16((int*)p, (int*)&a, ldm, 0);
468
+ else
469
+ __hmma_m16n16k16_st_c_f16((int*)p, (int*)&a, ldm, 1);
470
+ }
471
+
472
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator,16, 16, 16, float>& a, unsigned ldm, layout_t layout) {
473
+ if (layout == mem_row_major)
474
+ __hmma_m16n16k16_st_c_f32((float*)p, (float*)&a, ldm, 0);
475
+ else
476
+ __hmma_m16n16k16_st_c_f32((float*)p, (float*)&a, ldm, 1);
477
+ }
478
+
479
+ #ifdef __CUDA_IMMA__
480
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator,16, 16, 16, int>& a, unsigned ldm, layout_t layout) {
481
+ if (layout == mem_row_major)
482
+ __imma_m16n16k16_st_c_i32(p, (const int*)&a, ldm, 0);
483
+ else
484
+ __imma_m16n16k16_st_c_i32(p, (const int*)&a, ldm, 1);
485
+ }
486
+ #endif /* __CUDA_IMMA__ */
487
+
488
+ //
489
+ // Store functions for frags of shape m32n8k16
490
+ //
491
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 32, 8, 16, __half>& a, unsigned ldm, layout_t layout) {
492
+ if (layout == mem_row_major)
493
+ __hmma_m32n8k16_st_c_f16((int*)p, (int*)&a, ldm, 0);
494
+ else
495
+ __hmma_m32n8k16_st_c_f16((int*)p, (int*)&a, ldm, 1);
496
+ }
497
+
498
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 32, 8, 16, float>& a, unsigned ldm, layout_t layout) {
499
+ if (layout == mem_row_major)
500
+ __hmma_m32n8k16_st_c_f32((float*)p, (float*)&a, ldm, 0);
501
+ else
502
+ __hmma_m32n8k16_st_c_f32((float*)p, (float*)&a, ldm, 1);
503
+ }
504
+
505
+ #ifdef __CUDA_IMMA__
506
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 32, 8, 16, int>& a, unsigned ldm, layout_t layout) {
507
+ if (layout == mem_row_major)
508
+ __imma_m32n8k16_st_c_i32(p, (const int*)&a, ldm, 0);
509
+ else
510
+ __imma_m32n8k16_st_c_i32(p, (const int*)&a, ldm, 1);
511
+ }
512
+ #endif /* __CUDA_IMMA__ */
513
+
514
+ //
515
+ // Store functions for frags of shape m8n32k16
516
+ //
517
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(__half *p, const fragment<accumulator, 8, 32, 16, __half>& a, unsigned ldm, layout_t layout) {
518
+ if (layout == mem_row_major)
519
+ __hmma_m8n32k16_st_c_f16((int*)p, (int*)&a, ldm, 0);
520
+ else
521
+ __hmma_m8n32k16_st_c_f16((int*)p, (int*)&a, ldm, 1);
522
+ }
523
+
524
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 8, 32, 16, float>& a, unsigned ldm, layout_t layout) {
525
+ if (layout == mem_row_major)
526
+ __hmma_m8n32k16_st_c_f32((float*)p, (float*)&a, ldm, 0);
527
+ else
528
+ __hmma_m8n32k16_st_c_f32((float*)p, (float*)&a, ldm, 1);
529
+ }
530
+
531
+ #ifdef __CUDA_IMMA__
532
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 32, 16, int>& a, unsigned ldm, layout_t layout) {
533
+ if (layout == mem_row_major)
534
+ __imma_m8n32k16_st_c_i32(p, (const int*)&a, ldm, 0);
535
+ else
536
+ __imma_m8n32k16_st_c_i32(p, (const int*)&a, ldm, 1);
537
+ }
538
+ #endif /* __CUDA_IMMA__ */
539
+
540
+ #ifdef __CUDA_SUBBYTE_IMMA__
541
+ //
542
+ // Store functions for frags of shape m8n8k32
543
+ //
544
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 32, int>& a, unsigned ldm, layout_t layout) {
545
+ if (layout == mem_row_major)
546
+ __imma_m8n8k32_st_c_i32(p, (const int*)&a, ldm, 0);
547
+ else
548
+ __imma_m8n8k32_st_c_i32(p, (const int*)&a, ldm, 1);
549
+ }
550
+
551
+ //
552
+ // Store functions for frags of shape m8n8k128
553
+ //
554
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(int *p, const fragment<accumulator, 8, 8, 128, int>& a, unsigned ldm, layout_t layout) {
555
+ if (layout == mem_row_major)
556
+ __bmma_m8n8k128_st_c_i32(p, (const int*)&a, ldm, 0);
557
+ else
558
+ __bmma_m8n8k128_st_c_i32(p, (const int*)&a, ldm, 1);
559
+ }
560
+ #endif /* __CUDA_SUBBYTE_IMMA__ */
561
+
562
+
563
+ #ifdef __CUDA_AMPERE_MMA__
564
+
565
+ //
566
+ // Store functions for frags of shape m16n16k8
567
+ //
568
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(float *p, const fragment<accumulator, 16, 16, 8, float>& a, unsigned ldm, layout_t layout) {
569
+ if (layout == mem_row_major)
570
+ __mma_m16n16k8_st_c_f32(p, (const float*)&a, ldm, 0);
571
+ else
572
+ __mma_m16n16k8_st_c_f32(p, (const float*)&a, ldm, 1);
573
+ }
574
+
575
+
576
+ //
577
+ // Store functions for frags of shape m8n8k4
578
+ //
579
+ __CUDA_MMA_DEVICE_DECL__ void store_matrix_sync(double *p, const fragment<accumulator, 8, 8, 4, double>& a, unsigned ldm, layout_t layout) {
580
+ if (layout == mem_row_major)
581
+ __dmma_m8n8k4_st_c_f64(p, (const double*)&a, ldm, 0);
582
+ else
583
+ __dmma_m8n8k4_st_c_f64(p, (const double*)&a, ldm, 1);
584
+ }
585
+ #endif /* __CUDA_AMPERE_MMA__ */
586
+
587
+ //
588
+ // MMA functions for shape m16n16k16
589
+ //
590
+ // D fp16, C fp16
591
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
592
+ __hmma_m16n16k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
593
+ }
594
+
595
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
596
+ __hmma_m16n16k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
597
+ }
598
+
599
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
600
+ __hmma_m16n16k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
601
+ }
602
+
603
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
604
+ __hmma_m16n16k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
605
+ }
606
+
607
+ // D fp32, C fp16
608
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
609
+ __hmma_m16n16k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
610
+ }
611
+
612
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
613
+ __hmma_m16n16k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
614
+ }
615
+
616
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
617
+ __hmma_m16n16k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
618
+ }
619
+
620
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, __half>& c) {
621
+ __hmma_m16n16k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
622
+ }
623
+
624
+ // D fp32, C fp32
625
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
626
+ __hmma_m16n16k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
627
+ }
628
+
629
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
630
+ __hmma_m16n16k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
631
+ }
632
+
633
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
634
+ __hmma_m16n16k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
635
+ }
636
+
637
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
638
+ __hmma_m16n16k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
639
+ }
640
+
641
+ // D fp16, C fp32
642
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
643
+ __hmma_m16n16k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
644
+ }
645
+
646
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
647
+ __hmma_m16n16k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
648
+ }
649
+
650
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, row_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
651
+ __hmma_m16n16k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
652
+ }
653
+
654
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, __half>& d, const fragment<matrix_a, 16, 16, 16, __half, col_major>& a, const fragment<matrix_b,16, 16, 16, __half, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
655
+ __hmma_m16n16k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
656
+ }
657
+
658
+ #ifdef __CUDA_IMMA__
659
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
660
+ if (satf)
661
+ __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 1, 1);
662
+ else
663
+ __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 1, 0);
664
+ }
665
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
666
+ if (satf)
667
+ __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 3, 1);
668
+ else
669
+ __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 3, 0);
670
+ }
671
+
672
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, row_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
673
+ if (satf)
674
+ __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 0, 1);
675
+ else
676
+ __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 0, 0);
677
+ }
678
+
679
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, signed char, col_major>& a, const fragment<matrix_b,16, 16, 16, signed char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
680
+ if (satf)
681
+ __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 2, 1);
682
+ else
683
+ __imma_m16n16k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 2, 0);
684
+ }
685
+
686
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
687
+ if (satf)
688
+ __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 1, 1);
689
+ else
690
+ __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 1, 0);
691
+ }
692
+
693
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, col_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
694
+ if (satf)
695
+ __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 3, 1);
696
+ else
697
+ __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 3, 0);
698
+ }
699
+
700
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, row_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
701
+ if (satf)
702
+ __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 0, 1);
703
+ else
704
+ __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 0, 0);
705
+ }
706
+
707
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, int>& d, const fragment<matrix_a, 16, 16, 16, unsigned char, col_major>& a, const fragment<matrix_b,16, 16, 16, unsigned char, row_major>& b, const fragment<accumulator,16, 16, 16, int>& c, bool satf) {
708
+ if (satf)
709
+ __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 2, 1);
710
+ else
711
+ __imma_m16n16k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int *)&c, 2, 0);
712
+ }
713
+ #endif /* __CUDA_IMMA__ */
714
+
715
+ #ifdef __CUDA_AMPERE_MMA__
716
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
717
+ __mma_bf16_m16n16k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
718
+ }
719
+
720
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
721
+ __mma_bf16_m16n16k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
722
+ }
723
+
724
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
725
+ __mma_bf16_m16n16k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
726
+ }
727
+
728
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,16, 16, 16, float>& d, const fragment<matrix_a, 16, 16, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b,16, 16, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator,16, 16, 16, float>& c) {
729
+ __mma_bf16_m16n16k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
730
+ }
731
+ #endif /* __CUDA_AMPERE_MMA__ */
732
+
733
+
734
+ //
735
+ // MMA functions for shape m32n8k16
736
+ //
737
+ // D fp16, C fp16
738
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
739
+ __hmma_m32n8k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
740
+ }
741
+
742
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
743
+ __hmma_m32n8k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
744
+ }
745
+
746
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
747
+ __hmma_m32n8k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
748
+ }
749
+
750
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
751
+ __hmma_m32n8k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
752
+ }
753
+
754
+ // D fp32, C fp16
755
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
756
+ __hmma_m32n8k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
757
+ }
758
+
759
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
760
+ __hmma_m32n8k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
761
+ }
762
+
763
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
764
+ __hmma_m32n8k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
765
+ }
766
+
767
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, __half>& c) {
768
+ __hmma_m32n8k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
769
+ }
770
+
771
+ // D fp32, C fp32
772
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
773
+ __hmma_m32n8k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
774
+ }
775
+
776
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
777
+ __hmma_m32n8k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
778
+ }
779
+
780
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
781
+ __hmma_m32n8k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
782
+ }
783
+
784
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
785
+ __hmma_m32n8k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
786
+ }
787
+
788
+ // D fp16, C fp32
789
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
790
+ __hmma_m32n8k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
791
+ }
792
+
793
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, col_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
794
+ __hmma_m32n8k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
795
+ }
796
+
797
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, row_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
798
+ __hmma_m32n8k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
799
+ }
800
+
801
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,32, 8, 16, __half>& d, const fragment<matrix_a, 32, 8, 16, __half, col_major>& a, const fragment<matrix_b,32, 8, 16, __half, row_major>& b, const fragment<accumulator,32, 8, 16, float>& c) {
802
+ __hmma_m32n8k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
803
+ }
804
+
805
+ #ifdef __CUDA_IMMA__
806
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
807
+ if (satf)
808
+ __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 1);
809
+ else
810
+ __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 0);
811
+ }
812
+
813
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
814
+ if (satf)
815
+ __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, 1);
816
+ else
817
+ __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, 0);
818
+ }
819
+
820
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, row_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
821
+ if (satf)
822
+ __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, 1);
823
+ else
824
+ __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, 0);
825
+ }
826
+
827
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, signed char, col_major>& a, const fragment<matrix_b, 32, 8, 16, signed char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
828
+ if (satf)
829
+ __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, 1);
830
+ else
831
+ __imma_m32n8k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, 0);
832
+ }
833
+
834
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
835
+ if (satf)
836
+ __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 1);
837
+ else
838
+ __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 0);
839
+ }
840
+
841
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, col_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
842
+ if (satf)
843
+ __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, 1);
844
+ else
845
+ __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, 0);
846
+
847
+ }
848
+
849
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, row_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
850
+ if (satf)
851
+ __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, 1);
852
+ else
853
+ __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, 0);
854
+
855
+ }
856
+
857
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, int>& d, const fragment<matrix_a, 32, 8, 16, unsigned char, col_major>& a, const fragment<matrix_b, 32, 8, 16, unsigned char, row_major>& b, const fragment<accumulator, 32, 8, 16, int>& c, bool satf) {
858
+ if (satf)
859
+ __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, 1);
860
+ else
861
+ __imma_m32n8k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, 0);
862
+
863
+ }
864
+ #endif /* __CUDA_IMMA__ */
865
+
866
+ #ifdef __CUDA_AMPERE_MMA__
867
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) {
868
+ __mma_bf16_m32n8k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
869
+ }
870
+
871
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) {
872
+ __mma_bf16_m32n8k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
873
+ }
874
+
875
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) {
876
+ __mma_bf16_m32n8k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
877
+ }
878
+
879
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 32, 8, 16, float>& d, const fragment<matrix_a, 32, 8, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 32, 8, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 32, 8, 16, float>& c) {
880
+ __mma_bf16_m32n8k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
881
+ }
882
+ #endif /* __CUDA_AMPERE_MMA__ */
883
+
884
+ //
885
+ // MMA functions for shape m8n32k16
886
+ //
887
+ // D fp16, C fp16
888
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
889
+ __hmma_m8n32k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
890
+ }
891
+
892
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
893
+ __hmma_m8n32k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
894
+ }
895
+
896
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
897
+ __hmma_m8n32k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
898
+ }
899
+
900
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
901
+ __hmma_m8n32k16_mma_f16f16((int*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
902
+ }
903
+
904
+ // D fp32, C fp16
905
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
906
+ __hmma_m8n32k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 1, 0);
907
+ }
908
+
909
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
910
+ __hmma_m8n32k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 3, 0);
911
+ }
912
+
913
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
914
+ __hmma_m8n32k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 0, 0);
915
+ }
916
+
917
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, __half>& c) {
918
+ __hmma_m8n32k16_mma_f32f16((float*)&d, (const int*)&a, (const int*)&b, (const int*)&c, 2, 0);
919
+ }
920
+
921
+ // D fp32, C fp32
922
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
923
+ __hmma_m8n32k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
924
+ }
925
+
926
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
927
+ __hmma_m8n32k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
928
+ }
929
+
930
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
931
+ __hmma_m8n32k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
932
+ }
933
+
934
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
935
+ __hmma_m8n32k16_mma_f32f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
936
+ }
937
+
938
+ // D fp16, C fp32
939
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
940
+ __hmma_m8n32k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
941
+ }
942
+
943
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, col_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
944
+ __hmma_m8n32k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
945
+ }
946
+
947
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, row_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
948
+ __hmma_m8n32k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
949
+ }
950
+
951
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator,8, 32, 16, __half>& d, const fragment<matrix_a, 8, 32, 16, __half, col_major>& a, const fragment<matrix_b,8, 32, 16, __half, row_major>& b, const fragment<accumulator,8, 32, 16, float>& c) {
952
+ __hmma_m8n32k16_mma_f16f32((int*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
953
+ }
954
+
955
+ #ifdef __CUDA_IMMA__
956
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
957
+ if (satf)
958
+ __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 1);
959
+ else
960
+ __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 0);
961
+ }
962
+
963
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
964
+ if (satf)
965
+ __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, 1);
966
+ else
967
+ __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, 0);
968
+ }
969
+
970
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, row_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
971
+ if (satf)
972
+ __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, 1);
973
+ else
974
+ __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, 0);
975
+ }
976
+
977
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, signed char, col_major>& a, const fragment<matrix_b, 8, 32, 16, signed char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
978
+ if (satf)
979
+ __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, 1);
980
+ else
981
+ __imma_m8n32k16_mma_s8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, 0);
982
+ }
983
+
984
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
985
+ if (satf)
986
+ __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 1);
987
+ else
988
+ __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 0);
989
+ }
990
+
991
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, col_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
992
+ if (satf)
993
+ __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, 1);
994
+ else
995
+ __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 3, 0);
996
+ }
997
+
998
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, row_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
999
+ if (satf)
1000
+ __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, 1);
1001
+ else
1002
+ __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 0, 0);
1003
+ }
1004
+
1005
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, int>& d, const fragment<matrix_a, 8, 32, 16, unsigned char, col_major>& a, const fragment<matrix_b, 8, 32, 16, unsigned char, row_major>& b, const fragment<accumulator, 8, 32, 16, int>& c, bool satf) {
1006
+ if (satf)
1007
+ __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, 1);
1008
+ else
1009
+ __imma_m8n32k16_mma_u8((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 2, 0);
1010
+ }
1011
+ #endif /* __CUDA_IMMA__ */
1012
+
1013
+ #ifdef __CUDA_AMPERE_MMA__
1014
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) {
1015
+ __mma_bf16_m8n32k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
1016
+ }
1017
+
1018
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, col_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) {
1019
+ __mma_bf16_m8n32k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
1020
+ }
1021
+
1022
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, row_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) {
1023
+ __mma_bf16_m8n32k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
1024
+ }
1025
+
1026
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 32, 16, float>& d, const fragment<matrix_a, 8, 32, 16, __nv_bfloat16, col_major>& a, const fragment<matrix_b, 8, 32, 16, __nv_bfloat16, row_major>& b, const fragment<accumulator, 8, 32, 16, float>& c) {
1027
+ __mma_bf16_m8n32k16_mma_f32((float*)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
1028
+ }
1029
+ #endif /* __CUDA_AMPERE_MMA__ */
1030
+
1031
+
1032
+ #ifdef __CUDA_SUBBYTE_IMMA__
1033
+ //
1034
+ // MMA functions for shape m8n8k32
1035
+ //
1036
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf) {
1037
+ if (satf)
1038
+ __imma_m8n8k32_mma_s4((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 1);
1039
+ else
1040
+ __imma_m8n8k32_mma_s4((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 0);
1041
+ }
1042
+
1043
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 32, int>& d, const fragment<matrix_a, 8, 8, 32, experimental::precision::u4, row_major>& a, const fragment<matrix_b, 8, 8, 32, experimental::precision::u4, col_major>& b, const fragment<accumulator, 8, 8, 32, int>& c, bool satf) {
1044
+ if (satf)
1045
+ __imma_m8n8k32_mma_u4((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 1);
1046
+ else
1047
+ __imma_m8n8k32_mma_u4((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1, 0);
1048
+ }
1049
+
1050
+ //
1051
+ // MMA functions for shape m8n8k128
1052
+ //
1053
+ __CUDA_MMA_DEVICE_DECL__ void bmma_sync(fragment<accumulator, 8, 8, 128, int>& d, const fragment<matrix_a, 8, 8, 128, experimental::precision::b1, row_major>& a, const fragment<matrix_b, 8, 8, 128, experimental::precision::b1, col_major>& b, const fragment<accumulator, 8, 8, 128, int>& c,
1054
+ experimental::bmmaBitOp op, experimental::bmmaAccumulateOp)
1055
+ {
1056
+
1057
+ #ifdef __CUDA_AMPERE_MMA__
1058
+ if (op == experimental::bmmaBitOpAND)
1059
+ __bmma_m8n8k128_mma_and_popc_b1((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1);
1060
+ else
1061
+ #endif /* __CUDA_AMPERE_MMA__ */
1062
+ __bmma_m8n8k128_mma_xor_popc_b1((int*)&d, (const int *)&a, (const int *)&b, (const int*)&c, 1);
1063
+ }
1064
+
1065
+
1066
+ #endif /* __CUDA_SUBBYTE_IMMA__ */
1067
+
1068
+ #ifdef __CUDA_AMPERE_MMA__
1069
+ //
1070
+ // MMA functions for shape m16n16k8
1071
+ //
1072
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) {
1073
+ __mma_tf32_m16n16k8_mma_f32((float *)&d, (const int*)&a, (const int*)&b, (const float*)&c, 1, 0);
1074
+ }
1075
+
1076
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, col_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) {
1077
+ __mma_tf32_m16n16k8_mma_f32((float *)&d, (const int*)&a, (const int*)&b, (const float*)&c, 3, 0);
1078
+ }
1079
+
1080
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, row_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) {
1081
+ __mma_tf32_m16n16k8_mma_f32((float *)&d, (const int*)&a, (const int*)&b, (const float*)&c, 0, 0);
1082
+ }
1083
+
1084
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 16, 16, 8, float>& d, const fragment<matrix_a, 16, 16, 8, precision::tf32, col_major>& a, const fragment<matrix_b, 16, 16, 8, precision::tf32, row_major>& b, const fragment<accumulator, 16, 16, 8, float>& c) {
1085
+ __mma_tf32_m16n16k8_mma_f32((float *)&d, (const int*)&a, (const int*)&b, (const float*)&c, 2, 0);
1086
+ }
1087
+
1088
+
1089
+ //
1090
+ // MMA functions for shape m8n8k4
1091
+ //
1092
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) {
1093
+ __dmma_m8n8k4_mma_f64((double *)&d, (const double*)&a, (const double*)&b, (const double*)&c, 1, 0);
1094
+ }
1095
+
1096
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, col_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) {
1097
+ __dmma_m8n8k4_mma_f64((double *)&d, (const double*)&a, (const double*)&b, (const double*)&c, 3, 0);
1098
+ }
1099
+
1100
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, row_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) {
1101
+ __dmma_m8n8k4_mma_f64((double *)&d, (const double*)&a, (const double*)&b, (const double*)&c, 0, 0);
1102
+ }
1103
+
1104
+ __CUDA_MMA_DEVICE_DECL__ void mma_sync(fragment<accumulator, 8, 8, 4, double>& d, const fragment<matrix_a, 8, 8, 4, double, col_major>& a, const fragment<matrix_b, 8, 8, 4, double, row_major>& b, const fragment<accumulator, 8, 8, 4, double>& c) {
1105
+ __dmma_m8n8k4_mma_f64((double *)&d, (const double*)&a, (const double*)&b, (const double*)&c, 2, 0);
1106
+ }
1107
+
1108
+ #endif /* __CUDA_AMPERE_MMA__ */
1109
+
1110
+ };
1111
+ };
1112
+
1113
+ #undef __CUDA_IMMA__
1114
+ #undef __CUDA_SUBBYTE_IMMA__
1115
+ #undef __CUDA_MMA_DEVICE_DECL__
1116
+ #undef __CUDA_AMPERE_MMA__
1117
+
1118
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
1119
+
1120
+ #endif /* __cplusplus && __CUDACC__ */
1121
+
1122
+
1123
+ #endif /* __CUDA_MMA_HPP__ */
1124
+
1125
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_HPP__)
1126
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
1127
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_HPP__
1128
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/nvfunctional ADDED
@@ -0,0 +1,621 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * NVIDIA_COPYRIGHT_BEGIN
3
+ *
4
+ * Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
5
+ *
6
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
7
+ * and proprietary rights in and to this software, related documentation
8
+ * and any modifications thereto. Any use, reproduction, disclosure or
9
+ * distribution of this software and related documentation without an express
10
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
11
+ *
12
+ * NVIDIA_COPYRIGHT_END
13
+ */
14
+
15
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
16
+ #if defined(_MSC_VER)
17
+ #pragma message("crt/nvfunctional is an internal header file and must not be used directly. Please use nvfunctional instead.")
18
+ #else
19
+ #warning "crt/nvfunctional is an internal header file and must not be used directly. Please use nvfunctional instead."
20
+ #endif
21
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
22
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__
23
+ #endif
24
+
25
+ #ifndef __NV_LIBCXX_FUNCTIONAL_H__
26
+ #define __NV_LIBCXX_FUNCTIONAL_H__
27
+
28
+ #if __cplusplus < 201103L
29
+ #if defined(_MSC_VER)
30
+ #if _MSC_VER < 1800
31
+ #error This library requires VS 2013 and above
32
+ #endif /* _MSC_VER < 1800 */
33
+ #else /* !_MSC_VER */
34
+ #error This library requires support for the ISO C++ 2011 standard
35
+ #endif /* _MSC_VER */
36
+ #endif /* __cplusplus */
37
+
38
+ #if defined(_MSC_VER)
39
+ #define __NV_ALIGNOF __alignof
40
+ #define __NV_NOEXCEPT
41
+ #define __NV_CONSTEXPR
42
+ #else /* !_MSC_VER */
43
+ #define __NV_ALIGNOF alignof
44
+ #define __NV_NOEXCEPT noexcept
45
+ #define __NV_CONSTEXPR constexpr
46
+ #endif /* _MSC_VER */
47
+
48
+ #include <type_traits>
49
+ #include <cstddef>
50
+ #include <new>
51
+
52
+ // n3290 20.8
53
+ namespace nvstd
54
+ {
55
+
56
+ namespace internal {
57
+
58
+ // D.8.1 base (deprecated) [depr.base]
59
+ template <class _Arg, class _Result>
60
+ struct unary_function
61
+ {
62
+ typedef _Arg argument_type;
63
+ typedef _Result result_type;
64
+ };
65
+
66
+ template <class _Arg1, class _Arg2, class _Result>
67
+ struct binary_function
68
+ {
69
+ typedef _Arg1 first_argument_type;
70
+ typedef _Arg2 second_argument_type;
71
+ typedef _Result result_type;
72
+ };
73
+
74
+ // move
75
+ template <class _T>
76
+ inline __device__ __host__
77
+ typename std::remove_reference<_T>::type&& move(_T&& __t) __NV_NOEXCEPT
78
+ {
79
+ return static_cast<typename std::remove_reference<_T>::type&&>(__t);
80
+ }
81
+
82
+ // 20.2.2 swap [utility.swap]
83
+ // swap
84
+ template<class _T,
85
+ class = typename std::enable_if<
86
+ std::is_move_constructible<_T>::value &&
87
+ std::is_move_assignable<_T>::value>::type>
88
+ inline __device__ __host__
89
+ void swap(_T& __a, _T& __b)
90
+ #if !defined(_MSC_VER)
91
+ noexcept(std::is_nothrow_move_constructible<_T>::value &&
92
+ std::is_nothrow_move_assignable<_T>::value)
93
+ #endif /* !defined(_MSC_VER) */
94
+ {
95
+ _T __t(internal::move(__a));
96
+ __a = internal::move(__b);
97
+ __b = internal::move(__t);
98
+ }
99
+
100
+ // 20.2.3 forward/move helpers [forward]
101
+ // forward
102
+ template <class _T>
103
+ inline __device__ __host__
104
+ _T&& forward(typename std::remove_reference<_T>::type& __t) __NV_NOEXCEPT
105
+ {
106
+ return static_cast<_T&&>(__t);
107
+ }
108
+
109
+ template <class _T>
110
+ inline __device__ __host__
111
+ _T&& forward(typename std::remove_reference<_T>::type&& __t) __NV_NOEXCEPT
112
+ {
113
+ static_assert(!std::is_lvalue_reference<_T>::value,
114
+ "Error: __t is instantiated with an lvalue reference type");
115
+ return static_cast<_T&&>(__t);
116
+ }
117
+
118
+ } // namespace internal
119
+
120
+ namespace __functional_helpers
121
+ {
122
+
123
+ struct __dummy_class;
124
+
125
+ // Store small functors locally:
126
+ // a functor is legitimate to local storage if it is one of the following types:
127
+ // * member object pointer;
128
+ // * member function pointer;
129
+ // * closure type of size less than or equal to the largest size of
130
+ // the above types;
131
+ // * function pointer;
132
+ // * any callable class whose size is less than or equal to
133
+ // the largest one of the above types;
134
+ union _Small_functor_types
135
+ {
136
+ void *__obj;
137
+ void (*__func_ptr)();
138
+ void (__dummy_class::*mem_fn_ptr)();
139
+ };
140
+
141
+ struct _Small_functor_data {
142
+ char __data[sizeof(_Small_functor_types)];
143
+ };
144
+
145
+ template <class _RetType, class ..._ArgTypes>
146
+ struct __maybe_base_function
147
+ { };
148
+
149
+ template <class _RetType, class _T1>
150
+ struct __maybe_base_function<_RetType(_T1)>
151
+ : public internal::unary_function<_T1, _RetType>
152
+ { };
153
+
154
+ template <class _RetType, class _T1, class _T2>
155
+ struct __maybe_base_function<_RetType(_T1, _T2)>
156
+ : public internal::binary_function<_T1, _T2, _RetType>
157
+ { };
158
+
159
+ } // namespace __functional_helpers
160
+
161
+ // 20.8.11 Polymorphic function wrappers [func.wrap]
162
+
163
+ // 20.8.11.1 Class bad_function_call [func.wrap.badcall]
164
+ // unimplemented because of exception
165
+ // class bad_function_call : public std::exception
166
+
167
+ // 20.8.11.2 Class template function [func.wrap.func]
168
+
169
+ template<class> class function; // undefined
170
+
171
+ // Simplified version of template class function, which
172
+ // * does not support allocator_arg_t;
173
+ // * does not support target and target_type that rely on RTTI
174
+ // * does not throw bad_function_call exception on invoking a NULL target
175
+ template <class _RetType, class ..._ArgTypes>
176
+ class function<_RetType(_ArgTypes...)>
177
+ : public __functional_helpers::__maybe_base_function<_RetType(_ArgTypes...)>
178
+ {
179
+ __functional_helpers::_Small_functor_data __small_functor_data;
180
+ void *__obj;
181
+ typedef _RetType(*__meta_fn_type)(void *, _ArgTypes...);
182
+ __meta_fn_type __meta_fn;
183
+ typedef void(*__cloner_type)(function &, const function &);
184
+ __cloner_type __cloner;
185
+ typedef void(*__destructor_type)(function *);
186
+ __destructor_type __destructor;
187
+
188
+ #pragma nv_exec_check_disable
189
+ template <class _F>
190
+ __device__ __host__
191
+ __NV_CONSTEXPR bool __use_small_functor_data() const
192
+ {
193
+ return (sizeof(_F) <= sizeof(__small_functor_data) &&
194
+ __NV_ALIGNOF(_F) <= __NV_ALIGNOF(
195
+ __functional_helpers::_Small_functor_types));
196
+ }
197
+
198
+ #pragma nv_exec_check_disable
199
+ __device__ __host__
200
+ void* __get_small_functor_data() const
201
+ {
202
+ return (void*)(&__small_functor_data.__data[0]);
203
+ }
204
+
205
+ #pragma nv_exec_check_disable
206
+ __device__ __host__
207
+ bool __is_small_functor_data() const
208
+ {
209
+ return __obj == __get_small_functor_data();
210
+ }
211
+
212
+ #pragma nv_exec_check_disable
213
+ template <class _F>
214
+ __device__ __host__
215
+ static _F& __get_functor(void *__p)
216
+ {
217
+ return *((_F*)__p);
218
+ }
219
+
220
+ #pragma nv_exec_check_disable
221
+ template <class _F>
222
+ __device__ __host__
223
+ static bool __is_empty_functor(const _F& /*__p*/)
224
+ {
225
+ return false;
226
+ }
227
+
228
+ #pragma nv_exec_check_disable
229
+ template <class _F>
230
+ __device__ __host__
231
+ static bool __is_empty_functor(const _F* __p)
232
+ {
233
+ return !__p;
234
+ }
235
+
236
+ #pragma nv_exec_check_disable
237
+ template <class _Res, class _C>
238
+ __device__ __host__
239
+ static bool __is_empty_functor(const _Res _C::* __p)
240
+ {
241
+ return !__p;
242
+ }
243
+
244
+ #pragma nv_exec_check_disable
245
+ template <class _Res, class... _Args>
246
+ __device__ __host__
247
+ static bool __is_empty_functor(const function<_Res(_Args...)>& __p)
248
+ {
249
+ return !__p;
250
+ }
251
+
252
+ template <class _F>
253
+ struct __make_cloner
254
+ {
255
+ #pragma nv_exec_check_disable
256
+ __device__ __host__
257
+ static void __clone_data(function &__dest, const function &__src)
258
+ {
259
+ if (__dest.__use_small_functor_data<_F>()) {
260
+ __dest.__obj = __dest.__get_small_functor_data();
261
+ new (__dest.__obj) _F(__src.__get_functor<_F>(__src.__obj));
262
+ }
263
+ else {
264
+ __dest.__obj = new _F(__src.__get_functor<_F>(__src.__obj));
265
+ }
266
+ }
267
+ };
268
+
269
+ template <class _F>
270
+ struct __make_destructor
271
+ {
272
+ #pragma nv_exec_check_disable
273
+ __device__ __host__
274
+ static void __destruct(function *__fn)
275
+ {
276
+ if (__fn->__use_small_functor_data<_F>()) {
277
+ (__fn->__get_functor<_F>(__fn->__obj)).~_F();
278
+ }
279
+ else {
280
+ delete (_F*)(__fn->__obj);
281
+ }
282
+ }
283
+ };
284
+
285
+ // We cannot simple define __make_functor in the following way:
286
+ // template <class _T, _F>
287
+ // __make_functor;
288
+ // template <class _RetType1, class _F, class... _ArgTypes1>
289
+ // struct __make_functor<_RetType1(_ArgTypes1...), _F>
290
+ //
291
+ // because VS 2013 cannot unpack _RetType1(_ArgTypes1...)
292
+ template <class _RetType1, class _F, class... _ArgTypes1>
293
+ struct __make_functor
294
+ {
295
+ typedef _RetType1 type;
296
+
297
+ #pragma nv_exec_check_disable
298
+ __device__ __host__
299
+ static _RetType1 __invoke(void *__d, _ArgTypes1... __args)
300
+ {
301
+ return __get_functor<_F>(__d)(
302
+ internal::forward<_ArgTypes1>(__args)...);
303
+ }
304
+ };
305
+
306
+ template <class _RetType1, class _C, class _M, class... _ArgTypes1>
307
+ struct __make_functor<_RetType1, _M _C::*,_ArgTypes1...>
308
+ {
309
+ typedef _RetType1 type;
310
+ typedef _RetType1(*_Fn)(_ArgTypes1...);
311
+
312
+ #pragma nv_exec_check_disable
313
+ __device__ __host__
314
+ static _RetType1 __invoke(void *__d, _ArgTypes1... __args)
315
+ {
316
+ return __get_functor<_Fn>(__d)(
317
+ internal::forward<_ArgTypes1>(__args)...);
318
+ }
319
+ };
320
+
321
+ // workaround for GCC version below 4.8
322
+ #if (__GNUC__ == 4) && (__GNUC_MINOR__ < 8)
323
+ template <class _F>
324
+ struct __check_callability
325
+ : public std::integral_constant<bool,
326
+ !std::is_same<_F, std::nullptr_t>::value>
327
+ { };
328
+ #elif defined(_MSC_VER)
329
+ // simulate VC 2013's behavior...
330
+ template <class _F>
331
+ struct __check_callability1
332
+ : public
333
+ std::integral_constant<bool,
334
+ // std::result_of does not handle member pointers well
335
+ std::is_member_pointer<_F>::value ||
336
+ std::is_convertible<
337
+ _RetType,
338
+ typename std::result_of<_F(_ArgTypes...)>::type
339
+ >::value
340
+ >
341
+ { };
342
+
343
+ template <class _F>
344
+ struct __check_callability
345
+ : public std::integral_constant<
346
+ bool,
347
+ !std::is_same<_F, function>::value &&
348
+ __check_callability1<typename std::remove_cv<_F>::type>::value>
349
+ { };
350
+ #else /* !((__GNUC__ == 4) && (__GNUC_MINOR__ < 8)) _MSC_VER */
351
+ template <class _F,
352
+ class _T = typename std::result_of<_F(_ArgTypes...)>::type>
353
+ struct __check_callability
354
+ : public std::integral_constant<
355
+ bool,
356
+ !std::is_same<_F, function>::value &&
357
+ std::is_convertible< _T, _RetType>::value>
358
+ { };
359
+ #endif /* __GNUC__ == 4) && (__GNUC_MINOR__ < 8) */
360
+
361
+ #pragma nv_exec_check_disable
362
+ __device__ __host__
363
+ void __destroy()
364
+ {
365
+ if (__obj) {
366
+ __destructor(this);
367
+ __obj = 0;
368
+ }
369
+ }
370
+
371
+ #pragma nv_exec_check_disable
372
+ __device__ __host__
373
+ void __clear()
374
+ {
375
+ __obj = 0;
376
+ __meta_fn = 0;
377
+ __cloner = 0;
378
+ __destructor = 0;
379
+ }
380
+
381
+ public:
382
+ typedef _RetType result_type;
383
+
384
+ /*
385
+ * These typedef(s) are derived from __maybe_base_function
386
+ * typedef T1 argument_type; // only if sizeof...(ArgTypes) == 1 and
387
+ * // the type in ArgTypes is T1
388
+ * typedef T1 first_argument_type; // only if sizeof...(ArgTypes) == 2 and
389
+ * // ArgTypes contains T1 and T2
390
+ * typedef T2 second_argument_type; // only if sizeof...(ArgTypes) == 2 and
391
+ * // ArgTypes contains T1 and T2
392
+ */
393
+
394
+ // 20.8.11.2.1 construct/copy/destroy [func.wrap.con]
395
+
396
+ #pragma nv_exec_check_disable
397
+ __device__ __host__
398
+ function() __NV_NOEXCEPT
399
+ : __obj(0), __meta_fn(0), __cloner(0), __destructor(0) {}
400
+
401
+ #pragma nv_exec_check_disable
402
+ __device__ __host__
403
+ function(std::nullptr_t) __NV_NOEXCEPT
404
+ : __obj(0), __meta_fn(0), __cloner(0), __destructor(0) {}
405
+
406
+ #pragma nv_exec_check_disable
407
+ __device__ __host__
408
+ function(const function &__fn)
409
+ {
410
+ if (__fn.__obj == 0) {
411
+ __clear();
412
+ }
413
+ else {
414
+ __meta_fn = __fn.__meta_fn;
415
+ __destructor = __fn.__destructor;
416
+ __fn.__cloner(*this, __fn);
417
+ __cloner = __fn.__cloner;
418
+ }
419
+ }
420
+
421
+ #pragma nv_exec_check_disable
422
+ __device__ __host__
423
+ function(function &&__fn)
424
+ {
425
+ __fn.swap(*this);
426
+ }
427
+
428
+ // VS 2013 cannot process __check_callability type trait.
429
+ // So, we check callability using static_assert instead of
430
+ // using SFINAE such as
431
+ // template<class _F,
432
+ // class = typename std::enable_if<
433
+ // __check_callability<_F>::value
434
+ // >::type>
435
+
436
+ #pragma nv_exec_check_disable
437
+ template<class _F>
438
+ __device__ __host__
439
+ function(_F);
440
+
441
+ // copy and swap
442
+ #pragma nv_exec_check_disable
443
+ __device__ __host__
444
+ function& operator=(const function& __fn)
445
+ {
446
+ function(__fn).swap(*this);
447
+ return *this;
448
+ }
449
+
450
+ #pragma nv_exec_check_disable
451
+ __device__ __host__
452
+ function& operator=(function&& __fn)
453
+ {
454
+ function(internal::move(__fn)).swap(*this);
455
+ return *this;
456
+ }
457
+
458
+ #pragma nv_exec_check_disable
459
+ __device__ __host__
460
+ function& operator=(std::nullptr_t)
461
+ {
462
+ __destroy();
463
+ return *this;
464
+ }
465
+
466
+ #pragma nv_exec_check_disable
467
+ template<class _F>
468
+ __device__ __host__
469
+ function&
470
+ operator=(_F&& __fn)
471
+ {
472
+ static_assert(__check_callability<_F>::value,
473
+ "Unable to create functor object!");
474
+ function(internal::forward<_F>(__fn)).swap(*this);
475
+ return *this;
476
+ }
477
+
478
+ #pragma nv_exec_check_disable
479
+ __device__ __host__
480
+ ~function()
481
+ {
482
+ __destroy();
483
+ }
484
+
485
+ // 20.8.11.2.2 function modifiers [func.wrap.func.mod]
486
+ #pragma nv_exec_check_disable
487
+ __device__ __host__
488
+ void swap(function& __fn) __NV_NOEXCEPT
489
+ {
490
+ internal::swap(__meta_fn, __fn.__meta_fn);
491
+ internal::swap(__cloner, __fn.__cloner);
492
+ internal::swap(__destructor, __fn.__destructor);
493
+
494
+ if (__is_small_functor_data() && __fn.__is_small_functor_data()) {
495
+ internal::swap(__small_functor_data, __fn.__small_functor_data);
496
+ }
497
+ else if (__is_small_functor_data()) {
498
+ internal::swap(__small_functor_data, __fn.__small_functor_data);
499
+ internal::swap(__obj, __fn.__obj);
500
+ __fn.__obj = __fn.__get_small_functor_data();
501
+ }
502
+ else if (__fn.__is_small_functor_data()) {
503
+ internal::swap(__small_functor_data, __fn.__small_functor_data);
504
+ internal::swap(__obj, __fn.__obj);
505
+ __obj = __get_small_functor_data();
506
+ }
507
+ else {
508
+ internal::swap(__obj, __fn.__obj);
509
+ }
510
+ }
511
+
512
+ // 20.8.11.2.3 function capacity [func.wrap.func.cap]
513
+ #pragma nv_exec_check_disable
514
+ __device__ __host__
515
+ explicit operator bool() const __NV_NOEXCEPT
516
+ {
517
+ return __obj;
518
+ }
519
+
520
+ // 20.8.11.2.4 function invocation [func.wrap.func.inv]
521
+ // function::operator() can only be called in device code
522
+ // to avoid cross-execution space calls
523
+ #pragma nv_exec_check_disable
524
+ __device__ __host__
525
+ _RetType operator()(_ArgTypes...) const;
526
+
527
+ };
528
+
529
+ // Out-of-line definitions
530
+ #pragma nv_exec_check_disable
531
+ template<class _RetType, class... _ArgTypes>
532
+ template<class _F>
533
+ __device__ __host__
534
+ function<_RetType(_ArgTypes...)>::function(_F __fn)
535
+ : __obj(0), __meta_fn(0), __cloner(0), __destructor(0)
536
+ {
537
+ static_assert(__check_callability<_F>::value,
538
+ "Unable to construct functor object!");
539
+ if (__is_empty_functor(__fn))
540
+ return;
541
+ __meta_fn = &__make_functor<_RetType, _F, _ArgTypes...>::__invoke;
542
+ __cloner = &__make_cloner<_F>::__clone_data;
543
+ __destructor = &__make_destructor<_F>::__destruct;
544
+
545
+ if (__use_small_functor_data<_F>()) {
546
+ __obj = __get_small_functor_data();
547
+ new ((void*)__obj) _F(internal::move(__fn));
548
+ }
549
+ else {
550
+ __obj = new _F(internal::move(__fn));
551
+ }
552
+ }
553
+
554
+ #pragma nv_exec_check_disable
555
+ template <class _RetType, class..._ArgTypes>
556
+ __device__ __host__
557
+ _RetType
558
+ function<_RetType(_ArgTypes...)>::operator()(_ArgTypes... __args) const
559
+ {
560
+ return __meta_fn(__obj, internal::forward<_ArgTypes>(__args)...);
561
+ }
562
+
563
+ // 20.8.11.2.6, Null pointer comparisons:
564
+
565
+ #pragma nv_exec_check_disable
566
+ template <class _R, class... _ArgTypes>
567
+ __device__ __host__
568
+ bool operator==(const function<_R(_ArgTypes...)>& __fn, std::nullptr_t)
569
+ __NV_NOEXCEPT
570
+ {
571
+ return !__fn;
572
+ }
573
+
574
+ #pragma nv_exec_check_disable
575
+ template <class _R, class... _ArgTypes>
576
+ __device__ __host__
577
+ bool operator==(std::nullptr_t, const function<_R(_ArgTypes...)>& __fn)
578
+ __NV_NOEXCEPT
579
+ {
580
+ return !__fn;
581
+ }
582
+
583
+ #pragma nv_exec_check_disable
584
+ template <class _R, class... _ArgTypes>
585
+ __device__ __host__
586
+ bool operator!=(const function<_R(_ArgTypes...)>& __fn, std::nullptr_t)
587
+ __NV_NOEXCEPT
588
+ {
589
+ return static_cast<bool>(__fn);
590
+ }
591
+
592
+ #pragma nv_exec_check_disable
593
+ template <class _R, class... _ArgTypes>
594
+ __device__ __host__
595
+ bool operator!=(std::nullptr_t, const function<_R(_ArgTypes...)>& __fn)
596
+ __NV_NOEXCEPT
597
+ {
598
+ return static_cast<bool>(__fn);
599
+ }
600
+
601
+ // 20.8.11.2.7, specialized algorithms:
602
+ #pragma nv_exec_check_disable
603
+ template <class _R, class... _ArgTypes>
604
+ __device__ __host__
605
+ void swap(function<_R(_ArgTypes...)>& __fn1, function<_R(_ArgTypes...)>& __fn2)
606
+ {
607
+ __fn1.swap(__fn2);
608
+ }
609
+
610
+ } // namespace nvstd
611
+
612
+ #undef __NV_NOEXCEPT
613
+ #undef __NV_CONSTEXPR
614
+ #undef __NV_ALIGNOF
615
+
616
+ #endif // __NV_LIBCXX_FUNCTIONAL_H__
617
+
618
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__)
619
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
620
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__
621
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.h ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ //NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
51
+ #define EXCLUDE_FROM_RTC
52
+
53
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
54
+ #if defined(_MSC_VER)
55
+ #pragma message("crt/sm_70_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
56
+ #else
57
+ #warning "crt/sm_70_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
58
+ #endif
59
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
60
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__
61
+ #endif
62
+
63
+ #if !defined(__SM_70_RT_H__)
64
+ #define __SM_70_RT_H__
65
+
66
+ #if defined(__CUDACC_RTC__)
67
+ #define __SM_70_RT_DECL__ __host__ __device__
68
+ #elif defined(_NVHPC_CUDA)
69
+ #define __SM_70_RT_DECL__ extern __device__ __cudart_builtin__
70
+ #else /* !__CUDACC_RTC__ */
71
+ #define __SM_70_RT_DECL__ static __device__ __inline__
72
+ #endif /* __CUDACC_RTC__ */
73
+
74
+ #if defined(__cplusplus) && defined(__CUDACC__)
75
+
76
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
77
+
78
+ /*******************************************************************************
79
+ * *
80
+ * *
81
+ * *
82
+ *******************************************************************************/
83
+
84
+ #include "builtin_types.h"
85
+ #include "device_types.h"
86
+ #include "host_defines.h"
87
+
88
+ #if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
89
+ #define __DEF_IF_HOST { }
90
+ #else /* !__CUDA_ARCH__ */
91
+ #define __DEF_IF_HOST ;
92
+ #endif /* __CUDA_ARCH__ */
93
+
94
+
95
+ /******************************************************************************
96
+ * match *
97
+ ******************************************************************************/
98
+ __SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned value) __DEF_IF_HOST
99
+ __SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, int value) __DEF_IF_HOST
100
+ __SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long value) __DEF_IF_HOST
101
+ __SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long value) __DEF_IF_HOST
102
+ __SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long long value) __DEF_IF_HOST
103
+ __SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long long value) __DEF_IF_HOST
104
+ __SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, float value) __DEF_IF_HOST
105
+ __SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, double value) __DEF_IF_HOST
106
+
107
+ __SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned value, int *pred) __DEF_IF_HOST
108
+ __SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, int value, int *pred) __DEF_IF_HOST
109
+ __SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long value, int *pred) __DEF_IF_HOST
110
+ __SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long value, int *pred) __DEF_IF_HOST
111
+ __SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long long value, int *pred) __DEF_IF_HOST
112
+ __SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long long value, int *pred) __DEF_IF_HOST
113
+ __SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, float value, int *pred) __DEF_IF_HOST
114
+ __SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, double value, int *pred) __DEF_IF_HOST
115
+
116
+ __SM_70_RT_DECL__ void __nanosleep(unsigned int ns) __DEF_IF_HOST
117
+
118
+ __SM_70_RT_DECL__ unsigned short int atomicCAS(unsigned short int *address, unsigned short int compare, unsigned short int val) __DEF_IF_HOST
119
+
120
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
121
+
122
+ #endif /* __cplusplus && __CUDACC__ */
123
+
124
+ #undef __DEF_IF_HOST
125
+ #undef __SM_70_RT_DECL__
126
+
127
+ #if (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA)
128
+ #include "sm_70_rt.hpp"
129
+ #endif /* (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA) */
130
+
131
+ #endif /* !__SM_70_RT_H__ */
132
+
133
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__)
134
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
135
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__
136
+ #endif
137
+
138
+
139
+ #undef EXCLUDE_FROM_RTC
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.hpp ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("crt/sm_70_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "crt/sm_70_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__
58
+ #endif
59
+
60
+ #if !defined(__SM_70_RT_HPP__)
61
+ #define __SM_70_RT_HPP__
62
+
63
+ #if defined(__CUDACC_RTC__)
64
+ #define __SM_70_RT_DECL__ __host__ __device__
65
+ #else /* !__CUDACC_RTC__ */
66
+ #define __SM_70_RT_DECL__ static __device__ __inline__
67
+ #endif /* __CUDACC_RTC__ */
68
+
69
+ #if defined(__cplusplus) && defined(__CUDACC__)
70
+
71
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
72
+
73
+ /*******************************************************************************
74
+ * *
75
+ * *
76
+ * *
77
+ *******************************************************************************/
78
+
79
+ #include "builtin_types.h"
80
+ #include "device_types.h"
81
+ #include "host_defines.h"
82
+
83
+ /*******************************************************************************
84
+ * *
85
+ * Below are implementations of SM-7.0 builtin functions which are included as *
86
+ * source (instead of being built in to the compiler) *
87
+ * *
88
+ *******************************************************************************/
89
+
90
+ //
91
+ // __match_any_sync
92
+ //
93
+ __SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned value) {
94
+ return __match32_any_sync(mask, value);
95
+ }
96
+
97
+ __SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, int value) {
98
+ return __match32_any_sync(mask, value);
99
+ }
100
+
101
+ __SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long value) {
102
+ return (sizeof(long) == sizeof(long long)) ?
103
+ __match64_any_sync(mask, (unsigned long long)value):
104
+ __match32_any_sync(mask, (unsigned)value);
105
+ }
106
+
107
+ __SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long value) {
108
+ return (sizeof(long) == sizeof(long long)) ?
109
+ __match64_any_sync(mask, (unsigned long long)value):
110
+ __match32_any_sync(mask, (unsigned)value);
111
+ }
112
+
113
+ __SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long long value) {
114
+ return __match64_any_sync(mask, value);
115
+ }
116
+
117
+ __SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long long value) {
118
+ return __match64_any_sync(mask, value);
119
+ }
120
+
121
+ __SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, float value) {
122
+ return __match32_any_sync(mask, __float_as_uint(value));
123
+ }
124
+
125
+ __SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, double value) {
126
+ return __match64_any_sync(mask, __double_as_longlong(value));
127
+ }
128
+
129
+ //
130
+ // __match_all_sync
131
+ //
132
+ __SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned value, int *pred) {
133
+ return __match32_all_sync(mask, value, pred);
134
+ }
135
+
136
+ __SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, int value, int *pred) {
137
+ return __match32_all_sync(mask, value, pred);
138
+ }
139
+
140
+ __SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long value, int *pred) {
141
+ return (sizeof(long) == sizeof(long long)) ?
142
+ __match64_all_sync(mask, (unsigned long long)value, pred):
143
+ __match32_all_sync(mask, (unsigned)value, pred);
144
+ }
145
+
146
+ __SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long value, int *pred) {
147
+ return (sizeof(long) == sizeof(long long)) ?
148
+ __match64_all_sync(mask, (unsigned long long)value, pred):
149
+ __match32_all_sync(mask, (unsigned)value, pred);
150
+ }
151
+
152
+ __SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long long value, int *pred) {
153
+ return __match64_all_sync(mask, value, pred);
154
+ }
155
+
156
+ __SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long long value, int *pred) {
157
+ return __match64_all_sync(mask, value, pred);
158
+ }
159
+
160
+ __SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, float value, int *pred) {
161
+ return __match32_all_sync(mask, __float_as_uint(value), pred);
162
+ }
163
+
164
+ __SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, double value, int *pred) {
165
+ return __match64_all_sync(mask, __double_as_longlong(value), pred);
166
+ }
167
+
168
+ __SM_70_RT_DECL__ void __nanosleep(unsigned int ns) {
169
+ asm volatile("nanosleep.u32 %0;" :: "r"(ns));
170
+ }
171
+
172
+
173
+ extern "C" __device__ __device_builtin__
174
+ unsigned short __usAtomicCAS(unsigned short *, unsigned short, unsigned short);
175
+
176
+ __SM_70_RT_DECL__ unsigned short int atomicCAS(unsigned short int *address, unsigned short int compare, unsigned short int val) {
177
+ return __usAtomicCAS(address, compare, val);
178
+ }
179
+
180
+
181
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
182
+
183
+ #endif /* __cplusplus && __CUDACC__ */
184
+
185
+ #undef __SM_70_RT_DECL__
186
+
187
+ #endif /* !__SM_70_RT_HPP__ */
188
+
189
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__)
190
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
191
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__
192
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.h ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("crt/sm_80_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "crt/sm_80_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__
58
+ #endif
59
+
60
+ #if !defined(__SM_80_RT_H__)
61
+ #define __SM_80_RT_H__
62
+
63
+ #if defined(__CUDACC_RTC__)
64
+ #define __SM_80_RT_DECL__ __host__ __device__
65
+ #elif defined(_NVHPC_CUDA)
66
+ #define __SM_80_RT_DECL__ extern __device__ __cudart_builtin__
67
+ #else /* !__CUDACC_RTC__ */
68
+ #define __SM_80_RT_DECL__ static __device__ __inline__
69
+ #endif /* __CUDACC_RTC__ */
70
+
71
+ #if defined(__cplusplus) && defined(__CUDACC__)
72
+
73
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
74
+
75
+ /*******************************************************************************
76
+ * *
77
+ * *
78
+ * *
79
+ *******************************************************************************/
80
+
81
+ #include "builtin_types.h"
82
+ #include "device_types.h"
83
+ #include "host_defines.h"
84
+
85
+ #if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
86
+ #define __DEF_IF_HOST { }
87
+ #else /* !__CUDA_ARCH__ */
88
+ #define __DEF_IF_HOST ;
89
+ #endif /* __CUDA_ARCH__ */
90
+
91
+
92
+ //NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
93
+ #define EXCLUDE_FROM_RTC
94
+ /******************************************************************************
95
+ * reduce *
96
+ ******************************************************************************/
97
+ __SM_80_RT_DECL__ unsigned __reduce_add_sync(unsigned mask, unsigned value) __DEF_IF_HOST
98
+ __SM_80_RT_DECL__ unsigned __reduce_min_sync(unsigned mask, unsigned value) __DEF_IF_HOST
99
+ __SM_80_RT_DECL__ unsigned __reduce_max_sync(unsigned mask, unsigned value) __DEF_IF_HOST
100
+
101
+ __SM_80_RT_DECL__ int __reduce_add_sync(unsigned mask, int value) __DEF_IF_HOST
102
+ __SM_80_RT_DECL__ int __reduce_min_sync(unsigned mask, int value) __DEF_IF_HOST
103
+ __SM_80_RT_DECL__ int __reduce_max_sync(unsigned mask, int value) __DEF_IF_HOST
104
+
105
+ __SM_80_RT_DECL__ unsigned __reduce_and_sync(unsigned mask, unsigned value) __DEF_IF_HOST
106
+ __SM_80_RT_DECL__ unsigned __reduce_or_sync(unsigned mask, unsigned value) __DEF_IF_HOST
107
+ __SM_80_RT_DECL__ unsigned __reduce_xor_sync(unsigned mask, unsigned value) __DEF_IF_HOST
108
+
109
+ #undef EXCLUDE_FROM_RTC
110
+
111
+
112
+ extern "C" {
113
+ inline __device__ void *__nv_associate_access_property(const void *ptr,
114
+ unsigned long long property) {
115
+ extern __device__ void *__nv_associate_access_property_impl(const void *,
116
+ unsigned long long);
117
+ return __nv_associate_access_property_impl(ptr, property);
118
+ }
119
+
120
+ inline __device__ void __nv_memcpy_async_shared_global_4(void *dst,
121
+ const void *src,
122
+ unsigned src_size) {
123
+ extern __device__ void __nv_memcpy_async_shared_global_4_impl(void *,
124
+ const void *,
125
+ unsigned);
126
+ __nv_memcpy_async_shared_global_4_impl(dst, src, src_size);
127
+ }
128
+
129
+ inline __device__ void __nv_memcpy_async_shared_global_8(void *dst,
130
+ const void *src,
131
+ unsigned src_size) {
132
+ extern __device__ void __nv_memcpy_async_shared_global_8_impl(void *,
133
+ const void *,
134
+ unsigned);
135
+ __nv_memcpy_async_shared_global_8_impl(dst, src, src_size);
136
+ }
137
+
138
+ inline __device__ void __nv_memcpy_async_shared_global_16(void *dst,
139
+ const void *src,
140
+ unsigned src_size) {
141
+ extern __device__ void __nv_memcpy_async_shared_global_16_impl(void *,
142
+ const void *,
143
+ unsigned);
144
+ __nv_memcpy_async_shared_global_16_impl(dst, src, src_size);
145
+ }
146
+
147
+ }
148
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 800 */
149
+
150
+ #endif /* __cplusplus && __CUDACC__ */
151
+
152
+ #undef __DEF_IF_HOST
153
+ #undef __SM_80_RT_DECL__
154
+
155
+ #if (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA)
156
+ #include "sm_80_rt.hpp"
157
+ #endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
158
+
159
+ #endif /* !__SM_80_RT_H__ */
160
+
161
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__)
162
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
163
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_H__
164
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_80_rt.hpp ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("crt/sm_80_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "crt/sm_80_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__
58
+ #endif
59
+
60
+ #if !defined(__SM_80_RT_HPP__)
61
+ #define __SM_80_RT_HPP__
62
+
63
+ #if defined(__CUDACC_RTC__)
64
+ #define __SM_80_RT_DECL__ __host__ __device__
65
+ #else /* !__CUDACC_RTC__ */
66
+ #define __SM_80_RT_DECL__ static __device__ __inline__
67
+ #endif /* __CUDACC_RTC__ */
68
+
69
+ #if defined(__cplusplus) && defined(__CUDACC__)
70
+
71
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
72
+
73
+ /*******************************************************************************
74
+ * *
75
+ * *
76
+ * *
77
+ *******************************************************************************/
78
+
79
+ #include "builtin_types.h"
80
+ #include "device_types.h"
81
+ #include "host_defines.h"
82
+
83
+ /*******************************************************************************
84
+ * *
85
+ * Below are implementations of SM-8.0 builtin functions which are included as *
86
+ * source (instead of being built in to the compiler) *
87
+ * *
88
+ *******************************************************************************/
89
+
90
+ extern "C" {
91
+ __device_builtin__ __device__ unsigned __reduce_add_sync_unsigned_impl(unsigned, unsigned);
92
+ __device_builtin__ __device__ unsigned __reduce_min_sync_unsigned_impl(unsigned, unsigned);
93
+ __device_builtin__ __device__ unsigned __reduce_max_sync_unsigned_impl(unsigned, unsigned);
94
+ __device_builtin__ __device__ int __reduce_add_sync_signed_impl(unsigned, int);
95
+ __device_builtin__ __device__ int __reduce_min_sync_signed_impl(unsigned, int);
96
+ __device_builtin__ __device__ int __reduce_max_sync_signed_impl(unsigned, int);
97
+ __device_builtin__ __device__ unsigned __reduce_or_sync_unsigned_impl(unsigned, unsigned);
98
+ __device_builtin__ __device__ unsigned __reduce_and_sync_unsigned_impl(unsigned, unsigned);
99
+ __device_builtin__ __device__ unsigned __reduce_xor_sync_unsigned_impl(unsigned, unsigned);
100
+ }
101
+
102
+ __SM_80_RT_DECL__ unsigned __reduce_add_sync(unsigned mask, unsigned value) {
103
+ return __reduce_add_sync_unsigned_impl(mask, value);
104
+ }
105
+
106
+ __SM_80_RT_DECL__ unsigned __reduce_min_sync(unsigned mask, unsigned value) {
107
+ return __reduce_min_sync_unsigned_impl(mask, value);
108
+ }
109
+
110
+ __SM_80_RT_DECL__ unsigned __reduce_max_sync(unsigned mask, unsigned value) {
111
+ return __reduce_max_sync_unsigned_impl(mask, value);
112
+ }
113
+
114
+ __SM_80_RT_DECL__ int __reduce_add_sync(unsigned mask, int value) {
115
+ return __reduce_add_sync_signed_impl(mask, value);
116
+ }
117
+
118
+ __SM_80_RT_DECL__ int __reduce_min_sync(unsigned mask, int value) {
119
+ return __reduce_min_sync_signed_impl(mask, value);
120
+ }
121
+
122
+ __SM_80_RT_DECL__ int __reduce_max_sync(unsigned mask, int value) {
123
+ return __reduce_max_sync_signed_impl(mask, value);
124
+ }
125
+
126
+ __SM_80_RT_DECL__ unsigned __reduce_and_sync(unsigned mask, unsigned value) {
127
+ return __reduce_and_sync_unsigned_impl(mask, value);
128
+ }
129
+
130
+ __SM_80_RT_DECL__ unsigned __reduce_or_sync(unsigned mask, unsigned value) {
131
+ return __reduce_or_sync_unsigned_impl(mask, value);
132
+ }
133
+
134
+ __SM_80_RT_DECL__ unsigned __reduce_xor_sync(unsigned mask, unsigned value) {
135
+ return __reduce_xor_sync_unsigned_impl(mask, value);
136
+ }
137
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 800 */
138
+
139
+ #endif /* __cplusplus && __CUDACC__ */
140
+
141
+ #undef __SM_80_RT_DECL__
142
+
143
+ #endif /* !__SM_80_RT_HPP__ */
144
+
145
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__)
146
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
147
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_80_RT_HPP__
148
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.h ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2022-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("crt/sm_90_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "crt/sm_90_rt.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_H__
58
+ #endif
59
+
60
+ #if !defined(__SM_90_RT_H__)
61
+ #define __SM_90_RT_H__
62
+
63
+ #if defined(__CUDACC_RTC__)
64
+ #define __SM_90_RT_DECL__ __host__ __device__
65
+ #else /* !__CUDACC_RTC__ */
66
+ #define __SM_90_RT_DECL__ static __device__ __inline__
67
+ #endif /* __CUDACC_RTC__ */
68
+
69
+ #if defined(__cplusplus) && defined(__CUDACC__)
70
+
71
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
72
+
73
+ /*******************************************************************************
74
+ * *
75
+ * *
76
+ * *
77
+ *******************************************************************************/
78
+
79
+ #include "builtin_types.h"
80
+ #include "device_types.h"
81
+ #include "host_defines.h"
82
+
83
+ #if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
84
+ #define __DEF_IF_HOST { }
85
+ #else /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
86
+ #define __DEF_IF_HOST ;
87
+ #endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
88
+
89
+ //NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
90
+ #define EXCLUDE_FROM_RTC
91
+
92
+ __SM_90_RT_DECL__ unsigned __isCtaShared(const void *ptr) __DEF_IF_HOST
93
+ __SM_90_RT_DECL__ unsigned __isClusterShared(const void *ptr) __DEF_IF_HOST
94
+ __SM_90_RT_DECL__ void *__cluster_map_shared_rank(const void *ptr, unsigned target_block_rank) __DEF_IF_HOST
95
+ __SM_90_RT_DECL__ unsigned __cluster_query_shared_rank(const void *ptr) __DEF_IF_HOST
96
+ __SM_90_RT_DECL__ uint2 __cluster_map_shared_multicast(const void *ptr, unsigned cluster_cta_mask) __DEF_IF_HOST
97
+ __SM_90_RT_DECL__ unsigned __clusterDimIsSpecified() __DEF_IF_HOST
98
+ __SM_90_RT_DECL__ dim3 __clusterDim() __DEF_IF_HOST
99
+ __SM_90_RT_DECL__ dim3 __clusterRelativeBlockIdx() __DEF_IF_HOST
100
+ __SM_90_RT_DECL__ dim3 __clusterGridDimInClusters() __DEF_IF_HOST
101
+ __SM_90_RT_DECL__ dim3 __clusterIdx() __DEF_IF_HOST
102
+ __SM_90_RT_DECL__ unsigned __clusterRelativeBlockRank() __DEF_IF_HOST
103
+ __SM_90_RT_DECL__ unsigned __clusterSizeInBlocks() __DEF_IF_HOST
104
+ __SM_90_RT_DECL__ void __cluster_barrier_arrive() __DEF_IF_HOST
105
+ __SM_90_RT_DECL__ void __cluster_barrier_arrive_relaxed() __DEF_IF_HOST
106
+ __SM_90_RT_DECL__ void __cluster_barrier_wait() __DEF_IF_HOST
107
+ __SM_90_RT_DECL__ void __threadfence_cluster() __DEF_IF_HOST
108
+
109
+ __SM_90_RT_DECL__ float2 atomicAdd(float2 *__address, float2 val) __DEF_IF_HOST
110
+ __SM_90_RT_DECL__ float2 atomicAdd_block(float2 *__address, float2 val) __DEF_IF_HOST
111
+ __SM_90_RT_DECL__ float2 atomicAdd_system(float2 *__address, float2 val) __DEF_IF_HOST
112
+ __SM_90_RT_DECL__ float4 atomicAdd(float4 *__address, float4 val) __DEF_IF_HOST
113
+ __SM_90_RT_DECL__ float4 atomicAdd_block(float4 *__address, float4 val) __DEF_IF_HOST
114
+ __SM_90_RT_DECL__ float4 atomicAdd_system(float4 *__address, float4 val) __DEF_IF_HOST
115
+
116
+ #undef EXCLUDE_FROM_RTC
117
+
118
+ //Note: below atomic functions are templates, so cannot be represented in NVRTC
119
+ //builtins representation, so they have to be parsed on every NVRTC compilation.
120
+ //(notice 'EXCLUDE_FROM_RTC' ends above)
121
+
122
+
123
+ #ifndef __NV_DISABLE_128_ATOMICS
124
+ // lgen definitions for 128b atomics
125
+ extern "C" {
126
+ __device__ __device_builtin__ void __u128AtomicCAS(void *, void *, void *, void *);
127
+ __device__ __device_builtin__ void __u128AtomicCAS_block(void *, void *, void *, void *);
128
+ __device__ __device_builtin__ void __u128AtomicCAS_system(void *, void *, void *, void *);
129
+ __device__ __device_builtin__ void __u128AtomicExch(void *, void *, void *);
130
+ __device__ __device_builtin__ void __u128AtomicExch_block(void *, void *, void *);
131
+ __device__ __device_builtin__ void __u128AtomicExch_system(void *, void *, void *);
132
+ }
133
+
134
+ // macro to get address of object, to workaround situations where the type overloads the "&" operator
135
+ #define __NV_ATOMIC_ADDRESSOF(__val) \
136
+ (void *)(&(const_cast<char &>(reinterpret_cast<const volatile char &>(__val))))
137
+
138
+ // enable_if
139
+ template<bool __b, typename _T>
140
+ struct __nv_atomic_enable_if { };
141
+
142
+ template<typename _T>
143
+ struct __nv_atomic_enable_if<true, _T> { typedef _T __type; };
144
+
145
+ // alignof
146
+ #if defined(__CUDACC_RTC__)
147
+ #define __NV_ATOMIC_ALIGNOF __alignof__
148
+ #else
149
+ #define __NV_ATOMIC_ALIGNOF __alignof
150
+ #endif
151
+
152
+ // trivially copyable
153
+ template <typename _T>
154
+ struct __nv_atomic_triv_cp_helper {
155
+ #if defined(__GNUC__)
156
+ #if (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)
157
+ static const bool __val = true;
158
+ #elif (__GNUC__ < 5)
159
+ static const bool __val = __has_trivial_copy(_T);
160
+ #else
161
+ static const bool __val = __is_trivially_copyable(_T);
162
+ #endif
163
+ #else
164
+ static const bool __val = __is_trivially_copyable(_T);
165
+ #endif
166
+ };
167
+ #define __NV_ATOMIC_TRIVIALLY_COPYABLE(_T) \
168
+ __nv_atomic_triv_cp_helper<_T>::__val
169
+
170
+ // return type
171
+ #if __cplusplus >= 202002L // C++20 or greater
172
+ #define __NV_ATOMIC_RET_TYPE(_T) _T
173
+ #else
174
+ #define __NV_ATOMIC_RET_TYPE(_T) typename \
175
+ __nv_atomic_enable_if<sizeof(_T) == 16 && \
176
+ __NV_ATOMIC_ALIGNOF(_T) >= 16 && \
177
+ __NV_ATOMIC_TRIVIALLY_COPYABLE(_T), _T>::__type
178
+ #endif
179
+
180
+ // requires
181
+ #if __cplusplus >= 202002L // C++20 or greater
182
+ #define __NV_ATOMIC_REQUIRES(_T) \
183
+ requires(sizeof(_T) == 16 && \
184
+ __NV_ATOMIC_ALIGNOF(_T) >= 16 && \
185
+ __NV_ATOMIC_TRIVIALLY_COPYABLE(_T))
186
+ #else
187
+ #define __NV_ATOMIC_REQUIRES(_T)
188
+ #endif
189
+
190
+ // temp value and return value
191
+ #if __cplusplus >= 201103L || defined(_MSC_VER) // C++11 or greater, or MSC
192
+ #define __NV_ATOMIC_TEMP(_T) union _U \
193
+ {_T __ret; __device__ __inline__ _U() {}}; _U __u
194
+ #define __NV_ATOMIC_RET(_T) __u.__ret
195
+ #else
196
+ #define __NV_ATOMIC_TEMP(_T) _T __ret
197
+ #define __NV_ATOMIC_RET(_T) __ret
198
+ #endif
199
+
200
+ // templated 128-bit atomics
201
+ template <typename _T>
202
+ __SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
203
+ atomicCAS(_T *__address, _T __compare, _T __val) __NV_ATOMIC_REQUIRES(_T) {
204
+ __NV_ATOMIC_TEMP(_T);
205
+ __u128AtomicCAS((void *)(__address),
206
+ __NV_ATOMIC_ADDRESSOF(__compare),
207
+ __NV_ATOMIC_ADDRESSOF(__val),
208
+ __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
209
+ return __NV_ATOMIC_RET(_T);
210
+ }
211
+
212
+ template <typename _T>
213
+ __SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
214
+ atomicCAS_block(_T *__address, _T __compare, _T __val) __NV_ATOMIC_REQUIRES(_T) {
215
+ __NV_ATOMIC_TEMP(_T);
216
+ __u128AtomicCAS_block((void *)(__address),
217
+ __NV_ATOMIC_ADDRESSOF(__compare),
218
+ __NV_ATOMIC_ADDRESSOF(__val),
219
+ __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
220
+ return __NV_ATOMIC_RET(_T);
221
+ }
222
+
223
+ template <typename _T>
224
+ __SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
225
+ atomicCAS_system(_T *__address, _T __compare, _T __val) __NV_ATOMIC_REQUIRES(_T) {
226
+ __NV_ATOMIC_TEMP(_T);
227
+ __u128AtomicCAS_system((void *)(__address),
228
+ __NV_ATOMIC_ADDRESSOF(__compare),
229
+ __NV_ATOMIC_ADDRESSOF(__val),
230
+ __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
231
+ return __NV_ATOMIC_RET(_T);
232
+ }
233
+
234
+ template <typename _T>
235
+ __SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
236
+ atomicExch(_T *__address, _T __val) __NV_ATOMIC_REQUIRES(_T) {
237
+ __NV_ATOMIC_TEMP(_T);
238
+ __u128AtomicExch((void *)(__address),
239
+ __NV_ATOMIC_ADDRESSOF(__val),
240
+ __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
241
+ return __NV_ATOMIC_RET(_T);
242
+ }
243
+
244
+ template <typename _T>
245
+ __SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
246
+ atomicExch_block(_T *__address, _T __val) __NV_ATOMIC_REQUIRES(_T) {
247
+ __NV_ATOMIC_TEMP(_T);
248
+ __u128AtomicExch_block((void *)(__address),
249
+ __NV_ATOMIC_ADDRESSOF(__val),
250
+ __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
251
+ return __NV_ATOMIC_RET(_T);
252
+ }
253
+
254
+ template <typename _T>
255
+ __SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
256
+ atomicExch_system(_T *__address, _T __val) __NV_ATOMIC_REQUIRES(_T) {
257
+ __NV_ATOMIC_TEMP(_T);
258
+ __u128AtomicExch_system((void *)(__address),
259
+ __NV_ATOMIC_ADDRESSOF(__val),
260
+ __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));
261
+ return __NV_ATOMIC_RET(_T);
262
+ }
263
+ #endif /* !__NV_DISABLE_128_ATOMICS */
264
+
265
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 900 */
266
+
267
+ #endif /* __cplusplus && __CUDACC__ */
268
+
269
+ #undef __DEF_IF_HOST
270
+ #undef __SM_90_RT_DECL__
271
+
272
+ #if (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA)
273
+ #include "sm_90_rt.hpp"
274
+ #endif /* (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA) */
275
+
276
+ #endif /* !__SM_90_RT_H__ */
277
+
278
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_H__)
279
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
280
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_H__
281
+ #endif
282
+
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.hpp ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("crt/sm_90_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "crt/sm_90_rt.hpp is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_HPP__
58
+ #endif
59
+
60
+ #if !defined(__SM_90_RT_HPP__)
61
+ #define __SM_90_RT_HPP__
62
+
63
+ #if defined(__CUDACC_RTC__)
64
+ #define __SM_90_RT_DECL__ __host__ __device__
65
+ #else /* !__CUDACC_RTC__ */
66
+ #define __SM_90_RT_DECL__ static __device__ __inline__
67
+ #endif /* __CUDACC_RTC__ */
68
+
69
+ #if defined(__cplusplus) && defined(__CUDACC__)
70
+
71
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
72
+
73
+ /*******************************************************************************
74
+ * *
75
+ * *
76
+ * *
77
+ *******************************************************************************/
78
+
79
+ #include "builtin_types.h"
80
+ #include "device_types.h"
81
+ #include "host_defines.h"
82
+
83
+ /*******************************************************************************
84
+ * *
85
+ * Below are implementations of SM-9.0 builtin functions which are included as *
86
+ * source (instead of being built in to the compiler) *
87
+ * *
88
+ *******************************************************************************/
89
+ extern "C" {
90
+ __device__ unsigned __nv_isClusterShared_impl(const void *);
91
+ __device__ void * __nv_cluster_map_shared_rank_impl(const void *, unsigned);
92
+ __device__ unsigned __nv_cluster_query_shared_rank_impl(const void *);
93
+ __device__ unsigned __nv_clusterDimIsSpecifed_impl();
94
+ __device__ void __nv_clusterDim_impl(unsigned *, unsigned *, unsigned *);
95
+ __device__ void __nv_clusterRelativeBlockIdx_impl(unsigned *,
96
+ unsigned *, unsigned *);
97
+ __device__ void __nv_clusterGridDimInClusters_impl(unsigned *,
98
+ unsigned *, unsigned *);
99
+ __device__ void __nv_clusterIdx_impl(unsigned *, unsigned *, unsigned *);
100
+ __device__ unsigned __nv_clusterRelativeBlockRank_impl();
101
+ __device__ unsigned __nv_clusterSizeInBlocks_impl();
102
+ __device__ void __nv_cluster_barrier_arrive_impl();
103
+ __device__ void __nv_cluster_barrier_arrive_relaxed_impl();
104
+ __device__ void __nv_cluster_barrier_wait_impl();
105
+ __device__ void __nv_threadfence_cluster_impl();
106
+
107
+ __device__ __device_builtin__ float2 __f2AtomicAdd(float2 *, float2);
108
+ __device__ __device_builtin__ float2 __f2AtomicAdd_block(float2 *, float2);
109
+ __device__ __device_builtin__ float2 __f2AtomicAdd_system(float2 *, float2);
110
+ __device__ __device_builtin__ float4 __f4AtomicAdd(float4 *, float4);
111
+ __device__ __device_builtin__ float4 __f4AtomicAdd_block(float4 *, float4);
112
+ __device__ __device_builtin__ float4 __f4AtomicAdd_system(float4 *, float4);
113
+ } // extern "C"
114
+
115
+ __SM_90_RT_DECL__ unsigned __isCtaShared(const void *ptr)
116
+ {
117
+ return __isShared(ptr);
118
+ }
119
+
120
+ __SM_90_RT_DECL__ unsigned __isClusterShared(const void *ptr)
121
+ {
122
+ return __nv_isClusterShared_impl(ptr);
123
+ }
124
+
125
+ __SM_90_RT_DECL__ void *__cluster_map_shared_rank(const void *ptr,
126
+ unsigned target_block_rank)
127
+ {
128
+ return __nv_cluster_map_shared_rank_impl(ptr, target_block_rank);
129
+ }
130
+
131
+ __SM_90_RT_DECL__ unsigned __cluster_query_shared_rank(const void *ptr)
132
+ {
133
+ return __nv_cluster_query_shared_rank_impl(ptr);
134
+ }
135
+
136
+ __SM_90_RT_DECL__ uint2 __cluster_map_shared_multicast(const void *ptr,
137
+ unsigned int cluster_cta_mask)
138
+ {
139
+ return make_uint2((unsigned)__cvta_generic_to_shared(ptr), cluster_cta_mask);
140
+ }
141
+
142
+ __SM_90_RT_DECL__ unsigned __clusterDimIsSpecified()
143
+ {
144
+ return __nv_clusterDimIsSpecifed_impl();
145
+ }
146
+
147
+ __SM_90_RT_DECL__ dim3 __clusterDim()
148
+ {
149
+ unsigned x, y, z;
150
+ __nv_clusterDim_impl(&x, &y, &z);
151
+ return dim3(x,y,z);
152
+ }
153
+
154
+ __SM_90_RT_DECL__ dim3 __clusterRelativeBlockIdx()
155
+ {
156
+ unsigned x, y, z;
157
+ __nv_clusterRelativeBlockIdx_impl(&x, &y, &z);
158
+ return dim3(x,y,z);
159
+ }
160
+
161
+ __SM_90_RT_DECL__ dim3 __clusterGridDimInClusters()
162
+ {
163
+ unsigned x, y, z;
164
+ __nv_clusterGridDimInClusters_impl(&x, &y, &z);
165
+ return dim3(x,y,z);
166
+ }
167
+
168
+ __SM_90_RT_DECL__ dim3 __clusterIdx()
169
+ {
170
+ unsigned x, y, z;
171
+ __nv_clusterIdx_impl(&x, &y, &z);
172
+ return dim3(x,y,z);
173
+ }
174
+
175
+ __SM_90_RT_DECL__ unsigned __clusterRelativeBlockRank()
176
+ {
177
+ return __nv_clusterRelativeBlockRank_impl();
178
+ }
179
+
180
+ __SM_90_RT_DECL__ unsigned __clusterSizeInBlocks()
181
+ {
182
+ return __nv_clusterSizeInBlocks_impl();
183
+ }
184
+
185
+ __SM_90_RT_DECL__ void __cluster_barrier_arrive()
186
+ {
187
+ __nv_cluster_barrier_arrive_impl();
188
+ }
189
+
190
+ __SM_90_RT_DECL__ void __cluster_barrier_arrive_relaxed()
191
+ {
192
+ __nv_cluster_barrier_arrive_relaxed_impl();
193
+ }
194
+
195
+ __SM_90_RT_DECL__ void __cluster_barrier_wait()
196
+ {
197
+ __nv_cluster_barrier_wait_impl();
198
+ }
199
+
200
+ __SM_90_RT_DECL__ void __threadfence_cluster()
201
+ {
202
+ __nv_threadfence_cluster_impl();
203
+ }
204
+
205
+
206
+ /* Define __PTR for atomicAdd prototypes below, undef after done */
207
+ #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
208
+ #define __PTR "l"
209
+ #else
210
+ #define __PTR "r"
211
+ #endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
212
+
213
+ __SM_90_RT_DECL__ float2 atomicAdd(float2 *address, float2 val) {
214
+ return __f2AtomicAdd(address, val);
215
+ }
216
+
217
+ __SM_90_RT_DECL__ float2 atomicAdd_block(float2 *address, float2 val) {
218
+ return __f2AtomicAdd_block(address, val);
219
+ }
220
+
221
+ __SM_90_RT_DECL__ float2 atomicAdd_system(float2 *address, float2 val) {
222
+ return __f2AtomicAdd_system(address, val);
223
+ }
224
+
225
+ __SM_90_RT_DECL__ float4 atomicAdd(float4 *address, float4 val) {
226
+ return __f4AtomicAdd(address, val);
227
+ }
228
+
229
+ __SM_90_RT_DECL__ float4 atomicAdd_block(float4 *address, float4 val) {
230
+ return __f4AtomicAdd_block(address, val);
231
+ }
232
+
233
+ __SM_90_RT_DECL__ float4 atomicAdd_system(float4 *address, float4 val) {
234
+ return __f4AtomicAdd_system(address, val);
235
+ }
236
+
237
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 900 */
238
+
239
+ #endif /* __cplusplus && __CUDACC__ */
240
+
241
+ #undef __SM_90_RT_DECL__
242
+
243
+ #endif /* !__SM_90_RT_HPP__ */
244
+
245
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_HPP__)
246
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
247
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_HPP__
248
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/crt/storage_class.h ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * NVIDIA_COPYRIGHT_BEGIN
3
+ *
4
+ * Copyright (c) 2008-2018, NVIDIA CORPORATION. All rights reserved.
5
+ *
6
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
7
+ * and proprietary rights in and to this software, related documentation
8
+ * and any modifications thereto. Any use, reproduction, disclosure or
9
+ * distribution of this software and related documentation without an express
10
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
11
+ *
12
+ * NVIDIA_COPYRIGHT_END
13
+ */
14
+
15
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
16
+ #if defined(_MSC_VER)
17
+ #pragma message("crt/storage_class.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
18
+ #else
19
+ #warning "crt/storage_class.h is an internal header file and must not be used directly. Please use cuda_runtime_api.h or cuda_runtime.h instead."
20
+ #endif
21
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
22
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__
23
+ #endif
24
+
25
+ #if !defined(__STORAGE_CLASS_H__)
26
+ #define __STORAGE_CLASS_H__
27
+
28
+ #if !defined(__var_used__)
29
+
30
+ #define __var_used__
31
+
32
+ #endif /* __var_used__ */
33
+
34
+ #if !defined(__loc_sc__)
35
+
36
+ #define __loc_sc__(loc, size, sc) \
37
+ __storage##_##sc##size##loc loc
38
+
39
+ #endif /* !__loc_sc__ */
40
+
41
+ #if !defined(__storage___device__)
42
+ #define __storage___device__ static __var_used__
43
+ #endif /* __storage___device__ */
44
+
45
+ #if !defined(__storage_extern__device__)
46
+ #define __storage_extern__device__ static __var_used__
47
+ #endif /* __storage_extern__device__ */
48
+
49
+ #if !defined(__storage_auto__device__)
50
+ #define __storage_auto__device__ @@@ COMPILER @@@ ERROR @@@
51
+ #endif /* __storage_auto__device__ */
52
+
53
+ #if !defined(__storage_static__device__)
54
+ #define __storage_static__device__ static __var_used__
55
+ #endif /* __storage_static__device__ */
56
+
57
+ #if !defined(__storage___constant__)
58
+ #define __storage___constant__ static __var_used__
59
+ #endif /* __storage___constant__ */
60
+
61
+ #if !defined(__storage_extern__constant__)
62
+ #define __storage_extern__constant__ static __var_used__
63
+ #endif /* __storage_extern__constant__ */
64
+
65
+ #if !defined(__storage_auto__constant__)
66
+ #define __storage_auto__constant__ @@@ COMPILER @@@ ERROR @@@
67
+ #endif /* __storage_auto__constant__ */
68
+
69
+ #if !defined(__storage_static__constant__)
70
+ #define __storage_static__constant__ static __var_used__
71
+ #endif /* __storage_static__constant__ */
72
+
73
+ #if !defined(__storage___shared__)
74
+ #define __storage___shared__ static __var_used__
75
+ #endif /* __storage___shared__ */
76
+
77
+ #if !defined(__storage_extern__shared__)
78
+ #define __storage_extern__shared__ static __var_used__
79
+ #endif /* __storage_extern__shared__ */
80
+
81
+ #if !defined(__storage_auto__shared__)
82
+ #define __storage_auto__shared__ static
83
+ #endif /* __storage_auto__shared__ */
84
+
85
+ #if !defined(__storage_static__shared__)
86
+ #define __storage_static__shared__ static __var_used__
87
+ #endif /* __storage_static__shared__ */
88
+
89
+ #if !defined(__storage__unsized__shared__)
90
+ #define __storage__unsized__shared__ @@@ COMPILER @@@ ERROR @@@
91
+ #endif /* __storage__unsized__shared__ */
92
+
93
+ #if !defined(__storage_extern_unsized__shared__)
94
+ #define __storage_extern_unsized__shared__ static __var_used__
95
+ #endif /* __storage_extern_unsized__shared__ */
96
+
97
+ #if !defined(__storage_auto_unsized__shared__)
98
+ #define __storage_auto_unsized__shared__ @@@ COMPILER @@@ ERROR @@@
99
+ #endif /* __storage_auto_unsized__shared__ */
100
+
101
+ #if !defined(__storage_static_unsized__shared__)
102
+ #define __storage_static_unsized__shared__ @@@ COMPILER @@@ ERROR @@@
103
+ #endif /* __storage_static_unsized__shared__ */
104
+
105
+ #if !defined(__storage___text__)
106
+ #define __storage___text__ static __var_used__
107
+ #endif /* __storage___text__ */
108
+
109
+ #if !defined(__storage_extern__text__)
110
+ #define __storage_extern__text__ static __var_used__
111
+ #endif /* __storage_extern__text__ */
112
+
113
+ #if !defined(__storage_auto__text__)
114
+ #define __storage_auto__text__ @@@ COMPILER @@@ ERROR @@@
115
+ #endif /* __storage_auto__text__ */
116
+
117
+ #if !defined(__storage_static__text__)
118
+ #define __storage_static__text__ static __var_used__
119
+ #endif /* __storage_static__text__ */
120
+
121
+ #if !defined(__storage___surf__)
122
+ #define __storage___surf__ static __var_used__
123
+ #endif /* __storage___surf__ */
124
+
125
+ #if !defined(__storage_extern__surf__)
126
+ #define __storage_extern__surf__ static __var_used__
127
+ #endif /* __storage_extern__surf__ */
128
+
129
+ #if !defined(__storage_auto__surf__)
130
+ #define __storage_auto__surf__ @@@ COMPILER @@@ ERROR @@@
131
+ #endif /* __storage_auto__surf__ */
132
+
133
+ #if !defined(__storage_static__surf__)
134
+ #define __storage_static__surf__ static __var_used__
135
+ #endif /* __storage_static__surf__ */
136
+
137
+ #endif /* !__STORAGE_CLASS_H__ */
138
+
139
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__)
140
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
141
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__
142
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaGL.h ADDED
@@ -0,0 +1,608 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CUDAGL_H
51
+ #define CUDAGL_H
52
+
53
+ #include <cuda.h>
54
+ #include <GL/gl.h>
55
+
56
+ #if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
57
+ #define __CUDA_DEPRECATED
58
+ #elif defined(_MSC_VER)
59
+ #define __CUDA_DEPRECATED __declspec(deprecated)
60
+ #elif defined(__GNUC__)
61
+ #define __CUDA_DEPRECATED __attribute__((deprecated))
62
+ #else
63
+ #define __CUDA_DEPRECATED
64
+ #endif
65
+
66
+ #ifdef CUDA_FORCE_API_VERSION
67
+ #error "CUDA_FORCE_API_VERSION is no longer supported."
68
+ #endif
69
+
70
+ #if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
71
+ #define __CUDA_API_PER_THREAD_DEFAULT_STREAM
72
+ #define __CUDA_API_PTDS(api) api ## _ptds
73
+ #define __CUDA_API_PTSZ(api) api ## _ptsz
74
+ #else
75
+ #define __CUDA_API_PTDS(api) api
76
+ #define __CUDA_API_PTSZ(api) api
77
+ #endif
78
+
79
+ #define cuGLCtxCreate cuGLCtxCreate_v2
80
+ #define cuGLMapBufferObject __CUDA_API_PTDS(cuGLMapBufferObject_v2)
81
+ #define cuGLMapBufferObjectAsync __CUDA_API_PTSZ(cuGLMapBufferObjectAsync_v2)
82
+ #define cuGLGetDevices cuGLGetDevices_v2
83
+
84
+ #ifdef __cplusplus
85
+ extern "C" {
86
+ #endif
87
+
88
+ /**
89
+ * \file cudaGL.h
90
+ * \brief Header file for the OpenGL interoperability functions of the
91
+ * low-level CUDA driver application programming interface.
92
+ */
93
+
94
+ /**
95
+ * \defgroup CUDA_GL OpenGL Interoperability
96
+ * \ingroup CUDA_DRIVER
97
+ *
98
+ * ___MANBRIEF___ OpenGL interoperability functions of the low-level CUDA
99
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
100
+ *
101
+ * This section describes the OpenGL interoperability functions of the
102
+ * low-level CUDA driver application programming interface. Note that mapping
103
+ * of OpenGL resources is performed with the graphics API agnostic, resource
104
+ * mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
105
+ *
106
+ * @{
107
+ */
108
+
109
+ #if defined(_WIN32)
110
+ #if !defined(WGL_NV_gpu_affinity)
111
+ typedef void* HGPUNV;
112
+ #endif
113
+ #endif /* _WIN32 */
114
+
115
+ /**
116
+ * \brief Registers an OpenGL buffer object
117
+ *
118
+ * Registers the buffer object specified by \p buffer for access by
119
+ * CUDA. A handle to the registered object is returned as \p
120
+ * pCudaResource. The register flags \p Flags specify the intended usage,
121
+ * as follows:
122
+ *
123
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
124
+ * resource will be used. It is therefore assumed that this resource will be
125
+ * read from and written to by CUDA. This is the default value.
126
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
127
+ * will not write to this resource.
128
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
129
+ * CUDA will not read from this resource and will write over the
130
+ * entire contents of the resource, so none of the data previously
131
+ * stored in the resource will be preserved.
132
+ *
133
+ * \param pCudaResource - Pointer to the returned object handle
134
+ * \param buffer - name of buffer object to be registered
135
+ * \param Flags - Register flags
136
+ *
137
+ * \return
138
+ * ::CUDA_SUCCESS,
139
+ * ::CUDA_ERROR_INVALID_HANDLE,
140
+ * ::CUDA_ERROR_ALREADY_MAPPED,
141
+ * ::CUDA_ERROR_INVALID_CONTEXT,
142
+ * ::CUDA_ERROR_OPERATING_SYSTEM
143
+ * \notefnerr
144
+ *
145
+ * \sa
146
+ * ::cuGraphicsUnregisterResource,
147
+ * ::cuGraphicsMapResources,
148
+ * ::cuGraphicsResourceGetMappedPointer,
149
+ * ::cudaGraphicsGLRegisterBuffer
150
+ */
151
+ CUresult CUDAAPI cuGraphicsGLRegisterBuffer(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
152
+
153
+ /**
154
+ * \brief Register an OpenGL texture or renderbuffer object
155
+ *
156
+ * Registers the texture or renderbuffer object specified by \p image for access by CUDA.
157
+ * A handle to the registered object is returned as \p pCudaResource.
158
+ *
159
+ * \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D,
160
+ * ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY,
161
+ * or ::GL_RENDERBUFFER.
162
+ *
163
+ * The register flags \p Flags specify the intended usage, as follows:
164
+ *
165
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
166
+ * resource will be used. It is therefore assumed that this resource will be
167
+ * read from and written to by CUDA. This is the default value.
168
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
169
+ * will not write to this resource.
170
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
171
+ * CUDA will not read from this resource and will write over the
172
+ * entire contents of the resource, so none of the data previously
173
+ * stored in the resource will be preserved.
174
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
175
+ * bind this resource to a surface reference.
176
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
177
+ * texture gather operations on this resource.
178
+ *
179
+ * The following image formats are supported. For brevity's sake, the list is abbreviated.
180
+ * For ex., {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats
181
+ * {GL_R8, GL_R16, GL_RG8, GL_RG16} :
182
+ * - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
183
+ * - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
184
+ * - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
185
+ * {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
186
+ *
187
+ * The following image classes are currently disallowed:
188
+ * - Textures with borders
189
+ * - Multisampled renderbuffers
190
+ *
191
+ * \param pCudaResource - Pointer to the returned object handle
192
+ * \param image - name of texture or renderbuffer object to be registered
193
+ * \param target - Identifies the type of object specified by \p image
194
+ * \param Flags - Register flags
195
+ *
196
+ * \return
197
+ * ::CUDA_SUCCESS,
198
+ * ::CUDA_ERROR_INVALID_HANDLE,
199
+ * ::CUDA_ERROR_ALREADY_MAPPED,
200
+ * ::CUDA_ERROR_INVALID_CONTEXT,
201
+ * ::CUDA_ERROR_OPERATING_SYSTEM
202
+ * \notefnerr
203
+ *
204
+ * \sa
205
+ * ::cuGraphicsUnregisterResource,
206
+ * ::cuGraphicsMapResources,
207
+ * ::cuGraphicsSubResourceGetMappedArray,
208
+ * ::cudaGraphicsGLRegisterImage
209
+ */
210
+ CUresult CUDAAPI cuGraphicsGLRegisterImage(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
211
+
212
+ #ifdef _WIN32
213
+ /**
214
+ * \brief Gets the CUDA device associated with hGpu
215
+ *
216
+ * Returns in \p *pDevice the CUDA device associated with a \p hGpu, if
217
+ * applicable.
218
+ *
219
+ * \param pDevice - Device associated with hGpu
220
+ * \param hGpu - Handle to a GPU, as queried via ::WGL_NV_gpu_affinity()
221
+ *
222
+ * \return
223
+ * ::CUDA_SUCCESS,
224
+ * ::CUDA_ERROR_DEINITIALIZED,
225
+ * ::CUDA_ERROR_NOT_INITIALIZED,
226
+ * ::CUDA_ERROR_INVALID_CONTEXT,
227
+ * ::CUDA_ERROR_INVALID_VALUE
228
+ * \notefnerr
229
+ *
230
+ * \sa ::cuGLMapBufferObject,
231
+ * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
232
+ * ::cuGLUnregisterBufferObject, ::cuGLUnmapBufferObjectAsync,
233
+ * ::cuGLSetBufferObjectMapFlags,
234
+ * ::cudaWGLGetDevice
235
+ */
236
+ CUresult CUDAAPI cuWGLGetDevice(CUdevice *pDevice, HGPUNV hGpu);
237
+ #endif /* _WIN32 */
238
+
239
+ /**
240
+ * CUDA devices corresponding to an OpenGL device
241
+ */
242
+ typedef enum CUGLDeviceList_enum {
243
+ CU_GL_DEVICE_LIST_ALL = 0x01, /**< The CUDA devices for all GPUs used by the current OpenGL context */
244
+ CU_GL_DEVICE_LIST_CURRENT_FRAME = 0x02, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
245
+ CU_GL_DEVICE_LIST_NEXT_FRAME = 0x03, /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
246
+ } CUGLDeviceList;
247
+
248
+ /**
249
+ * \brief Gets the CUDA devices associated with the current OpenGL context
250
+ *
251
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices
252
+ * corresponding to the current OpenGL context. Also returns in \p *pCudaDevices
253
+ * at most cudaDeviceCount of the CUDA-compatible devices corresponding to
254
+ * the current OpenGL context. If any of the GPUs being used by the current OpenGL
255
+ * context are not CUDA capable then the call will return CUDA_ERROR_NO_DEVICE.
256
+ *
257
+ * The \p deviceList argument may be any of the following:
258
+ * - ::CU_GL_DEVICE_LIST_ALL: Query all devices used by the current OpenGL context.
259
+ * - ::CU_GL_DEVICE_LIST_CURRENT_FRAME: Query the devices used by the current OpenGL context to
260
+ * render the current frame (in SLI).
261
+ * - ::CU_GL_DEVICE_LIST_NEXT_FRAME: Query the devices used by the current OpenGL context to
262
+ * render the next frame (in SLI). Note that this is a prediction, it can't be guaranteed that
263
+ * this is correct in all cases.
264
+ *
265
+ * \param pCudaDeviceCount - Returned number of CUDA devices.
266
+ * \param pCudaDevices - Returned CUDA devices.
267
+ * \param cudaDeviceCount - The size of the output device array pCudaDevices.
268
+ * \param deviceList - The set of devices to return.
269
+ *
270
+ * \return
271
+ * ::CUDA_SUCCESS,
272
+ * ::CUDA_ERROR_NO_DEVICE,
273
+ * ::CUDA_ERROR_INVALID_VALUE,
274
+ * ::CUDA_ERROR_INVALID_CONTEXT,
275
+ * ::CUDA_ERROR_INVALID_GRAPHICS_CONTEXT,
276
+ * ::CUDA_ERROR_OPERATING_SYSTEM
277
+ *
278
+ * \notefnerr
279
+ *
280
+ * \sa
281
+ * ::cuWGLGetDevice,
282
+ * ::cudaGLGetDevices
283
+ */
284
+ CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
285
+
286
+ /**
287
+ * \defgroup CUDA_GL_DEPRECATED OpenGL Interoperability [DEPRECATED]
288
+ *
289
+ * ___MANBRIEF___ deprecated OpenGL interoperability functions of the low-level
290
+ * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
291
+ *
292
+ * This section describes deprecated OpenGL interoperability functionality.
293
+ *
294
+ * @{
295
+ */
296
+
297
+ /** Flags to map or unmap a resource */
298
+ typedef enum CUGLmap_flags_enum {
299
+ CU_GL_MAP_RESOURCE_FLAGS_NONE = 0x00,
300
+ CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
301
+ CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02,
302
+ } CUGLmap_flags;
303
+
304
+ /**
305
+ * \brief Create a CUDA context for interoperability with OpenGL
306
+ *
307
+ * \deprecated This function is deprecated as of Cuda 5.0.
308
+ *
309
+ * This function is deprecated and should no longer be used. It is
310
+ * no longer necessary to associate a CUDA context with an OpenGL
311
+ * context in order to achieve maximum interoperability performance.
312
+ *
313
+ * \param pCtx - Returned CUDA context
314
+ * \param Flags - Options for CUDA context creation
315
+ * \param device - Device on which to create the context
316
+ *
317
+ * \return
318
+ * ::CUDA_SUCCESS,
319
+ * ::CUDA_ERROR_DEINITIALIZED,
320
+ * ::CUDA_ERROR_NOT_INITIALIZED,
321
+ * ::CUDA_ERROR_INVALID_CONTEXT,
322
+ * ::CUDA_ERROR_INVALID_VALUE,
323
+ * ::CUDA_ERROR_OUT_OF_MEMORY
324
+ * \notefnerr
325
+ *
326
+ * \sa ::cuCtxCreate, ::cuGLInit, ::cuGLMapBufferObject,
327
+ * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
328
+ * ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
329
+ * ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
330
+ * ::cuWGLGetDevice
331
+ */
332
+ __CUDA_DEPRECATED CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
333
+
334
+ /**
335
+ * \brief Initializes OpenGL interoperability
336
+ *
337
+ * \deprecated This function is deprecated as of Cuda 3.0.
338
+ *
339
+ * Initializes OpenGL interoperability. This function is deprecated
340
+ * and calling it is no longer required. It may fail if the needed
341
+ * OpenGL driver facilities are not available.
342
+ *
343
+ * \return
344
+ * ::CUDA_SUCCESS,
345
+ * ::CUDA_ERROR_DEINITIALIZED,
346
+ * ::CUDA_ERROR_NOT_INITIALIZED,
347
+ * ::CUDA_ERROR_INVALID_CONTEXT,
348
+ * ::CUDA_ERROR_UNKNOWN
349
+ * \notefnerr
350
+ *
351
+ * \sa ::cuGLMapBufferObject,
352
+ * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
353
+ * ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
354
+ * ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
355
+ * ::cuWGLGetDevice
356
+ */
357
+ __CUDA_DEPRECATED CUresult CUDAAPI cuGLInit(void);
358
+
359
+ /**
360
+ * \brief Registers an OpenGL buffer object
361
+ *
362
+ * \deprecated This function is deprecated as of Cuda 3.0.
363
+ *
364
+ * Registers the buffer object specified by \p buffer for access by
365
+ * CUDA. This function must be called before CUDA can map the buffer
366
+ * object. There must be a valid OpenGL context bound to the current
367
+ * thread when this function is called, and the buffer name is
368
+ * resolved by that context.
369
+ *
370
+ * \param buffer - The name of the buffer object to register.
371
+ *
372
+ * \return
373
+ * ::CUDA_SUCCESS,
374
+ * ::CUDA_ERROR_DEINITIALIZED,
375
+ * ::CUDA_ERROR_NOT_INITIALIZED,
376
+ * ::CUDA_ERROR_INVALID_CONTEXT,
377
+ * ::CUDA_ERROR_ALREADY_MAPPED
378
+ * \notefnerr
379
+ *
380
+ * \sa ::cuGraphicsGLRegisterBuffer
381
+ */
382
+ __CUDA_DEPRECATED CUresult CUDAAPI cuGLRegisterBufferObject(GLuint buffer);
383
+
384
+ /**
385
+ * \brief Maps an OpenGL buffer object
386
+ *
387
+ * \deprecated This function is deprecated as of Cuda 3.0.
388
+ *
389
+ * Maps the buffer object specified by \p buffer into the address space of the
390
+ * current CUDA context and returns in \p *dptr and \p *size the base pointer
391
+ * and size of the resulting mapping.
392
+ *
393
+ * There must be a valid OpenGL context bound to the current thread
394
+ * when this function is called. This must be the same context, or a
395
+ * member of the same shareGroup, as the context that was bound when
396
+ * the buffer was registered.
397
+ *
398
+ * All streams in the current CUDA context are synchronized with the
399
+ * current GL context.
400
+ *
401
+ * \param dptr - Returned mapped base pointer
402
+ * \param size - Returned size of mapping
403
+ * \param buffer - The name of the buffer object to map
404
+ *
405
+ * \return
406
+ * ::CUDA_SUCCESS,
407
+ * ::CUDA_ERROR_DEINITIALIZED,
408
+ * ::CUDA_ERROR_NOT_INITIALIZED,
409
+ * ::CUDA_ERROR_INVALID_CONTEXT,
410
+ * ::CUDA_ERROR_INVALID_VALUE,
411
+ * ::CUDA_ERROR_MAP_FAILED
412
+ * \notefnerr
413
+ *
414
+ * \sa ::cuGraphicsMapResources
415
+ */
416
+ __CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr *dptr, size_t *size, GLuint buffer);
417
+
418
+ /**
419
+ * \brief Unmaps an OpenGL buffer object
420
+ *
421
+ * \deprecated This function is deprecated as of Cuda 3.0.
422
+ *
423
+ * Unmaps the buffer object specified by \p buffer for access by CUDA.
424
+ *
425
+ * There must be a valid OpenGL context bound to the current thread
426
+ * when this function is called. This must be the same context, or a
427
+ * member of the same shareGroup, as the context that was bound when
428
+ * the buffer was registered.
429
+ *
430
+ * All streams in the current CUDA context are synchronized with the
431
+ * current GL context.
432
+ *
433
+ * \param buffer - Buffer object to unmap
434
+ *
435
+ * \return
436
+ * ::CUDA_SUCCESS,
437
+ * ::CUDA_ERROR_DEINITIALIZED,
438
+ * ::CUDA_ERROR_NOT_INITIALIZED,
439
+ * ::CUDA_ERROR_INVALID_CONTEXT,
440
+ * ::CUDA_ERROR_INVALID_VALUE
441
+ * \notefnerr
442
+ *
443
+ * \sa ::cuGraphicsUnmapResources
444
+ */
445
+ __CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObject(GLuint buffer);
446
+
447
+ /**
448
+ * \brief Unregister an OpenGL buffer object
449
+ *
450
+ * \deprecated This function is deprecated as of Cuda 3.0.
451
+ *
452
+ * Unregisters the buffer object specified by \p buffer. This
453
+ * releases any resources associated with the registered buffer.
454
+ * After this call, the buffer may no longer be mapped for access by
455
+ * CUDA.
456
+ *
457
+ * There must be a valid OpenGL context bound to the current thread
458
+ * when this function is called. This must be the same context, or a
459
+ * member of the same shareGroup, as the context that was bound when
460
+ * the buffer was registered.
461
+ *
462
+ * \param buffer - Name of the buffer object to unregister
463
+ *
464
+ * \return
465
+ * ::CUDA_SUCCESS,
466
+ * ::CUDA_ERROR_DEINITIALIZED,
467
+ * ::CUDA_ERROR_NOT_INITIALIZED,
468
+ * ::CUDA_ERROR_INVALID_CONTEXT,
469
+ * ::CUDA_ERROR_INVALID_VALUE
470
+ * \notefnerr
471
+ *
472
+ * \sa ::cuGraphicsUnregisterResource
473
+ */
474
+ __CUDA_DEPRECATED CUresult CUDAAPI cuGLUnregisterBufferObject(GLuint buffer);
475
+
476
+ /**
477
+ * \brief Set the map flags for an OpenGL buffer object
478
+ *
479
+ * \deprecated This function is deprecated as of Cuda 3.0.
480
+ *
481
+ * Sets the map flags for the buffer object specified by \p buffer.
482
+ *
483
+ * Changes to \p Flags will take effect the next time \p buffer is mapped.
484
+ * The \p Flags argument may be any of the following:
485
+ * - ::CU_GL_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
486
+ * resource will be used. It is therefore assumed that this resource will be
487
+ * read from and written to by CUDA kernels. This is the default value.
488
+ * - ::CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA kernels which
489
+ * access this resource will not write to this resource.
490
+ * - ::CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that CUDA kernels
491
+ * which access this resource will not read from this resource and will
492
+ * write over the entire contents of the resource, so none of the data
493
+ * previously stored in the resource will be preserved.
494
+ *
495
+ * If \p buffer has not been registered for use with CUDA, then
496
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p buffer is presently
497
+ * mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is returned.
498
+ *
499
+ * There must be a valid OpenGL context bound to the current thread
500
+ * when this function is called. This must be the same context, or a
501
+ * member of the same shareGroup, as the context that was bound when
502
+ * the buffer was registered.
503
+ *
504
+ * \param buffer - Buffer object to unmap
505
+ * \param Flags - Map flags
506
+ *
507
+ * \return
508
+ * ::CUDA_SUCCESS,
509
+ * ::CUDA_ERROR_NOT_INITIALIZED,
510
+ * ::CUDA_ERROR_INVALID_HANDLE,
511
+ * ::CUDA_ERROR_ALREADY_MAPPED,
512
+ * ::CUDA_ERROR_INVALID_CONTEXT,
513
+ * \notefnerr
514
+ *
515
+ * \sa ::cuGraphicsResourceSetMapFlags
516
+ */
517
+ __CUDA_DEPRECATED CUresult CUDAAPI cuGLSetBufferObjectMapFlags(GLuint buffer, unsigned int Flags);
518
+
519
+ /**
520
+ * \brief Maps an OpenGL buffer object
521
+ *
522
+ * \deprecated This function is deprecated as of Cuda 3.0.
523
+ *
524
+ * Maps the buffer object specified by \p buffer into the address space of the
525
+ * current CUDA context and returns in \p *dptr and \p *size the base pointer
526
+ * and size of the resulting mapping.
527
+ *
528
+ * There must be a valid OpenGL context bound to the current thread
529
+ * when this function is called. This must be the same context, or a
530
+ * member of the same shareGroup, as the context that was bound when
531
+ * the buffer was registered.
532
+ *
533
+ * Stream \p hStream in the current CUDA context is synchronized with
534
+ * the current GL context.
535
+ *
536
+ * \param dptr - Returned mapped base pointer
537
+ * \param size - Returned size of mapping
538
+ * \param buffer - The name of the buffer object to map
539
+ * \param hStream - Stream to synchronize
540
+ *
541
+ * \return
542
+ * ::CUDA_SUCCESS,
543
+ * ::CUDA_ERROR_DEINITIALIZED,
544
+ * ::CUDA_ERROR_NOT_INITIALIZED,
545
+ * ::CUDA_ERROR_INVALID_CONTEXT,
546
+ * ::CUDA_ERROR_INVALID_VALUE,
547
+ * ::CUDA_ERROR_MAP_FAILED
548
+ * \notefnerr
549
+ *
550
+ * \sa ::cuGraphicsMapResources
551
+ */
552
+ __CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr *dptr, size_t *size, GLuint buffer, CUstream hStream);
553
+
554
+ /**
555
+ * \brief Unmaps an OpenGL buffer object
556
+ *
557
+ * \deprecated This function is deprecated as of Cuda 3.0.
558
+ *
559
+ * Unmaps the buffer object specified by \p buffer for access by CUDA.
560
+ *
561
+ * There must be a valid OpenGL context bound to the current thread
562
+ * when this function is called. This must be the same context, or a
563
+ * member of the same shareGroup, as the context that was bound when
564
+ * the buffer was registered.
565
+ *
566
+ * Stream \p hStream in the current CUDA context is synchronized with
567
+ * the current GL context.
568
+ *
569
+ * \param buffer - Name of the buffer object to unmap
570
+ * \param hStream - Stream to synchronize
571
+ *
572
+ * \return
573
+ * ::CUDA_SUCCESS,
574
+ * ::CUDA_ERROR_DEINITIALIZED,
575
+ * ::CUDA_ERROR_NOT_INITIALIZED,
576
+ * ::CUDA_ERROR_INVALID_CONTEXT,
577
+ * ::CUDA_ERROR_INVALID_VALUE
578
+ * \notefnerr
579
+ *
580
+ * \sa ::cuGraphicsUnmapResources
581
+ */
582
+ __CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObjectAsync(GLuint buffer, CUstream hStream);
583
+
584
+ /** @} */ /* END CUDA_GL_DEPRECATED */
585
+ /** @} */ /* END CUDA_GL */
586
+
587
+
588
+ #if defined(__CUDA_API_VERSION_INTERNAL)
589
+ #undef cuGLCtxCreate
590
+ #undef cuGLMapBufferObject
591
+ #undef cuGLMapBufferObjectAsync
592
+ #undef cuGLGetDevices
593
+
594
+ CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
595
+ CUresult CUDAAPI cuGLMapBufferObject_v2(CUdeviceptr *dptr, size_t *size, GLuint buffer);
596
+ CUresult CUDAAPI cuGLMapBufferObjectAsync_v2(CUdeviceptr *dptr, size_t *size, GLuint buffer, CUstream hStream);
597
+ CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
598
+ CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer);
599
+ CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream);
600
+ #endif /* __CUDA_API_VERSION_INTERNAL */
601
+
602
+ #ifdef __cplusplus
603
+ };
604
+ #endif
605
+
606
+ #undef __CUDA_DEPRECATED
607
+
608
+ #endif
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaGLTypedefs.h ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CUDAGLTYPEDEFS_H
51
+ #define CUDAGLTYPEDEFS_H
52
+
53
+ // Dependent includes for cudagl.h
54
+ #include <GL/gl.h>
55
+
56
+ #include <cudaGL.h>
57
+
58
+ #if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
59
+ #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
60
+ #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
61
+ #else
62
+ #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
63
+ #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
64
+ #endif
65
+
66
+ #ifdef __cplusplus
67
+ extern "C" {
68
+ #endif // __cplusplus
69
+
70
+ /*
71
+ * Macros for the latest version for each driver function in cudaGL.h
72
+ */
73
+ #define PFN_cuGraphicsGLRegisterBuffer PFN_cuGraphicsGLRegisterBuffer_v3000
74
+ #define PFN_cuGraphicsGLRegisterImage PFN_cuGraphicsGLRegisterImage_v3000
75
+ #define PFN_cuWGLGetDevice PFN_cuWGLGetDevice_v2020
76
+ #define PFN_cuGLGetDevices PFN_cuGLGetDevices_v6050
77
+ #define PFN_cuGLCtxCreate PFN_cuGLCtxCreate_v3020
78
+ #define PFN_cuGLInit PFN_cuGLInit_v2000
79
+ #define PFN_cuGLRegisterBufferObject PFN_cuGLRegisterBufferObject_v2000
80
+ #define PFN_cuGLMapBufferObject __API_TYPEDEF_PTDS(PFN_cuGLMapBufferObject, 3020, 7000)
81
+ #define PFN_cuGLUnmapBufferObject PFN_cuGLUnmapBufferObject_v2000
82
+ #define PFN_cuGLUnregisterBufferObject PFN_cuGLUnregisterBufferObject_v2000
83
+ #define PFN_cuGLSetBufferObjectMapFlags PFN_cuGLSetBufferObjectMapFlags_v2030
84
+ #define PFN_cuGLMapBufferObjectAsync __API_TYPEDEF_PTSZ(PFN_cuGLMapBufferObjectAsync, 3020, 7000)
85
+ #define PFN_cuGLUnmapBufferObjectAsync PFN_cuGLUnmapBufferObjectAsync_v2030
86
+
87
+
88
+ /**
89
+ * Type definitions for functions defined in cudaGL.h
90
+ */
91
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterBuffer_v3000)(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
92
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterImage_v3000)(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
93
+ #ifdef _WIN32
94
+ typedef CUresult (CUDAAPI *PFN_cuWGLGetDevice_v2020)(CUdevice_v1 *pDevice, HGPUNV hGpu);
95
+ #endif
96
+ typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v6050)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
97
+ typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v3020)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
98
+ typedef CUresult (CUDAAPI *PFN_cuGLInit_v2000)(void);
99
+ typedef CUresult (CUDAAPI *PFN_cuGLRegisterBufferObject_v2000)(GLuint buffer);
100
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v7000_ptds)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
101
+ typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObject_v2000)(GLuint buffer);
102
+ typedef CUresult (CUDAAPI *PFN_cuGLUnregisterBufferObject_v2000)(GLuint buffer);
103
+ typedef CUresult (CUDAAPI *PFN_cuGLSetBufferObjectMapFlags_v2030)(GLuint buffer, unsigned int Flags);
104
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v7000_ptsz)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
105
+ typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObjectAsync_v2030)(GLuint buffer, CUstream hStream);
106
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
107
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
108
+
109
+ /*
110
+ * Type definitions for older versioned functions in cuda.h
111
+ */
112
+ #if defined(__CUDA_API_VERSION_INTERNAL)
113
+ typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v4010)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
114
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v2000)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer);
115
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v2030)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream);
116
+ typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v2000)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
117
+ #endif
118
+
119
+ #ifdef __cplusplus
120
+ }
121
+ #endif // __cplusplus
122
+
123
+ #endif // file guard
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cudaTypedefs.h ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_gl_interop.h ADDED
@@ -0,0 +1,514 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_GL_INTEROP_H__)
51
+ #define __CUDA_GL_INTEROP_H__
52
+
53
+ #include "cuda_runtime_api.h"
54
+
55
+ #if defined(__APPLE__)
56
+
57
+ #include <OpenGL/gl.h>
58
+
59
+ #else /* __APPLE__ */
60
+
61
+ #if defined(__arm__) || defined(__aarch64__)
62
+ #ifndef GL_VERSION
63
+ #error Please include the appropriate gl headers before including cuda_gl_interop.h
64
+ #endif
65
+ #else
66
+ #include <GL/gl.h>
67
+ #endif
68
+
69
+ #endif /* __APPLE__ */
70
+
71
+ /** \cond impl_private */
72
+ #if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
73
+ #define __CUDA_DEPRECATED
74
+ #elif defined(_MSC_VER)
75
+ #define __CUDA_DEPRECATED __declspec(deprecated)
76
+ #elif defined(__GNUC__)
77
+ #define __CUDA_DEPRECATED __attribute__((deprecated))
78
+ #else
79
+ #define __CUDA_DEPRECATED
80
+ #endif
81
+ /** \endcond impl_private */
82
+
83
+ #if defined(__cplusplus)
84
+ extern "C" {
85
+ #endif /* __cplusplus */
86
+
87
+ /**
88
+ * \addtogroup CUDART_OPENGL OpenGL Interoperability
89
+ * This section describes the OpenGL interoperability functions of the CUDA
90
+ * runtime application programming interface. Note that mapping of OpenGL
91
+ * resources is performed with the graphics API agnostic, resource mapping
92
+ * interface described in \ref CUDART_INTEROP "Graphics Interopability".
93
+ *
94
+ * @{
95
+ */
96
+
97
+ /**
98
+ * CUDA devices corresponding to the current OpenGL context
99
+ */
100
+ enum cudaGLDeviceList
101
+ {
102
+ cudaGLDeviceListAll = 1, /**< The CUDA devices for all GPUs used by the current OpenGL context */
103
+ cudaGLDeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
104
+ cudaGLDeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
105
+ };
106
+
107
+ /**
108
+ * \brief Gets the CUDA devices associated with the current OpenGL context
109
+ *
110
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices
111
+ * corresponding to the current OpenGL context. Also returns in \p *pCudaDevices
112
+ * at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to
113
+ * the current OpenGL context. If any of the GPUs being used by the current OpenGL
114
+ * context are not CUDA capable then the call will return ::cudaErrorNoDevice.
115
+ *
116
+ * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to the
117
+ * current OpenGL context
118
+ * \param pCudaDevices - Returned CUDA devices corresponding to the current
119
+ * OpenGL context
120
+ * \param cudaDeviceCount - The size of the output device array \p pCudaDevices
121
+ * \param deviceList - The set of devices to return. This set may be
122
+ * ::cudaGLDeviceListAll for all devices,
123
+ * ::cudaGLDeviceListCurrentFrame for the devices used to
124
+ * render the current frame (in SLI), or
125
+ * ::cudaGLDeviceListNextFrame for the devices used to
126
+ * render the next frame (in SLI).
127
+ *
128
+ * \return
129
+ * ::cudaSuccess,
130
+ * ::cudaErrorNoDevice,
131
+ * ::cudaErrorInvalidGraphicsContext,
132
+ * ::cudaErrorOperatingSystem,
133
+ * ::cudaErrorUnknown
134
+ *
135
+ * \note This function is not supported on Mac OS X.
136
+ * \notefnerr
137
+ *
138
+ * \sa
139
+ * ::cudaGraphicsUnregisterResource,
140
+ * ::cudaGraphicsMapResources,
141
+ * ::cudaGraphicsSubResourceGetMappedArray,
142
+ * ::cudaGraphicsResourceGetMappedPointer,
143
+ * ::cuGLGetDevices
144
+ */
145
+ extern __host__ cudaError_t CUDARTAPI cudaGLGetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, enum cudaGLDeviceList deviceList);
146
+
147
+ /**
148
+ * \brief Register an OpenGL texture or renderbuffer object
149
+ *
150
+ * Registers the texture or renderbuffer object specified by \p image for access by CUDA.
151
+ * A handle to the registered object is returned as \p resource.
152
+ *
153
+ * \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D,
154
+ * ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY,
155
+ * or ::GL_RENDERBUFFER.
156
+ *
157
+ * The register flags \p flags specify the intended usage, as follows:
158
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
159
+ * resource will be used. It is therefore assumed that this resource will be
160
+ * read from and written to by CUDA. This is the default value.
161
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
162
+ * will not write to this resource.
163
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
164
+ * CUDA will not read from this resource and will write over the
165
+ * entire contents of the resource, so none of the data previously
166
+ * stored in the resource will be preserved.
167
+ * - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
168
+ * bind this resource to a surface reference.
169
+ * - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
170
+ * texture gather operations on this resource.
171
+ *
172
+ * The following image formats are supported. For brevity's sake, the list is abbreviated.
173
+ * For ex., {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats
174
+ * {GL_R8, GL_R16, GL_RG8, GL_RG16} :
175
+ * - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
176
+ * - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
177
+ * - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
178
+ * {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
179
+ *
180
+ * The following image classes are currently disallowed:
181
+ * - Textures with borders
182
+ * - Multisampled renderbuffers
183
+ *
184
+ * \param resource - Pointer to the returned object handle
185
+ * \param image - name of texture or renderbuffer object to be registered
186
+ * \param target - Identifies the type of object specified by \p image
187
+ * \param flags - Register flags
188
+ *
189
+ * \return
190
+ * ::cudaSuccess,
191
+ * ::cudaErrorInvalidDevice,
192
+ * ::cudaErrorInvalidValue,
193
+ * ::cudaErrorInvalidResourceHandle,
194
+ * ::cudaErrorOperatingSystem,
195
+ * ::cudaErrorUnknown
196
+ * \notefnerr
197
+ *
198
+ * \sa
199
+ * ::cudaGraphicsUnregisterResource,
200
+ * ::cudaGraphicsMapResources,
201
+ * ::cudaGraphicsSubResourceGetMappedArray,
202
+ * ::cuGraphicsGLRegisterImage
203
+ */
204
+ extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterImage(struct cudaGraphicsResource **resource, GLuint image, GLenum target, unsigned int flags);
205
+
206
+ /**
207
+ * \brief Registers an OpenGL buffer object
208
+ *
209
+ * Registers the buffer object specified by \p buffer for access by
210
+ * CUDA. A handle to the registered object is returned as \p
211
+ * resource. The register flags \p flags specify the intended usage,
212
+ * as follows:
213
+ *
214
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
215
+ * resource will be used. It is therefore assumed that this resource will be
216
+ * read from and written to by CUDA. This is the default value.
217
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
218
+ * will not write to this resource.
219
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
220
+ * CUDA will not read from this resource and will write over the
221
+ * entire contents of the resource, so none of the data previously
222
+ * stored in the resource will be preserved.
223
+ *
224
+ * \param resource - Pointer to the returned object handle
225
+ * \param buffer - name of buffer object to be registered
226
+ * \param flags - Register flags
227
+ *
228
+ * \return
229
+ * ::cudaSuccess,
230
+ * ::cudaErrorInvalidDevice,
231
+ * ::cudaErrorInvalidValue,
232
+ * ::cudaErrorInvalidResourceHandle,
233
+ * ::cudaErrorOperatingSystem,
234
+ * ::cudaErrorUnknown
235
+ * \notefnerr
236
+ *
237
+ * \sa
238
+ * ::cudaGraphicsUnregisterResource,
239
+ * ::cudaGraphicsMapResources,
240
+ * ::cudaGraphicsResourceGetMappedPointer,
241
+ * ::cuGraphicsGLRegisterBuffer
242
+ */
243
+ extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterBuffer(struct cudaGraphicsResource **resource, GLuint buffer, unsigned int flags);
244
+
245
+ #ifdef _WIN32
246
+ #ifndef WGL_NV_gpu_affinity
247
+ typedef void* HGPUNV;
248
+ #endif
249
+
250
+ /**
251
+ * \brief Gets the CUDA device associated with hGpu
252
+ *
253
+ * Returns the CUDA device associated with a hGpu, if applicable.
254
+ *
255
+ * \param device - Returns the device associated with hGpu, or -1 if hGpu is
256
+ * not a compute device.
257
+ * \param hGpu - Handle to a GPU, as queried via WGL_NV_gpu_affinity
258
+ *
259
+ * \return
260
+ * ::cudaSuccess
261
+ * \notefnerr
262
+ *
263
+ * \sa
264
+ * ::WGL_NV_gpu_affinity,
265
+ * ::cuWGLGetDevice
266
+ */
267
+ extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu);
268
+ #endif
269
+
270
+ /** @} */ /* END CUDART_OPENGL */
271
+
272
+ /**
273
+ * \addtogroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED]
274
+ * This section describes deprecated OpenGL interoperability functionality.
275
+ *
276
+ * @{
277
+ */
278
+
279
+ /**
280
+ * CUDA GL Map Flags
281
+ */
282
+ enum cudaGLMapFlags
283
+ {
284
+ cudaGLMapFlagsNone = 0, /**< Default; Assume resource can be read/written */
285
+ cudaGLMapFlagsReadOnly = 1, /**< CUDA kernels will not write to this resource */
286
+ cudaGLMapFlagsWriteDiscard = 2 /**< CUDA kernels will only write to and will not read from this resource */
287
+ };
288
+
289
+ /**
290
+ * \brief Sets a CUDA device to use OpenGL interoperability
291
+ *
292
+ * \deprecated This function is deprecated as of CUDA 5.0.
293
+ *
294
+ * This function is deprecated and should no longer be used. It is
295
+ * no longer necessary to associate a CUDA device with an OpenGL
296
+ * context in order to achieve maximum interoperability performance.
297
+ *
298
+ * This function will immediately initialize the primary context on
299
+ * \p device if needed.
300
+ *
301
+ * \param device - Device to use for OpenGL interoperability
302
+ *
303
+ * \return
304
+ * ::cudaSuccess,
305
+ * ::cudaErrorInvalidDevice,
306
+ * ::cudaErrorSetOnActiveProcess
307
+ * \notefnerr
308
+ *
309
+ * \sa ::cudaGraphicsGLRegisterBuffer, ::cudaGraphicsGLRegisterImage
310
+ */
311
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
312
+
313
+ /**
314
+ * \brief Registers a buffer object for access by CUDA
315
+ *
316
+ * \deprecated This function is deprecated as of CUDA 3.0.
317
+ *
318
+ * Registers the buffer object of ID \p bufObj for access by
319
+ * CUDA. This function must be called before CUDA can map the buffer
320
+ * object. The OpenGL context used to create the buffer, or another
321
+ * context from the same share group, must be bound to the current
322
+ * thread when this is called.
323
+ *
324
+ * \param bufObj - Buffer object ID to register
325
+ *
326
+ * \return
327
+ * ::cudaSuccess,
328
+ * ::cudaErrorInitializationError
329
+ * \notefnerr
330
+ *
331
+ * \sa ::cudaGraphicsGLRegisterBuffer
332
+ */
333
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint bufObj);
334
+
335
+ /**
336
+ * \brief Maps a buffer object for access by CUDA
337
+ *
338
+ * \deprecated This function is deprecated as of CUDA 3.0.
339
+ *
340
+ * Maps the buffer object of ID \p bufObj into the address space of
341
+ * CUDA and returns in \p *devPtr the base pointer of the resulting
342
+ * mapping. The buffer must have previously been registered by
343
+ * calling ::cudaGLRegisterBufferObject(). While a buffer is mapped
344
+ * by CUDA, any OpenGL operation which references the buffer will
345
+ * result in undefined behavior. The OpenGL context used to create
346
+ * the buffer, or another context from the same share group, must be
347
+ * bound to the current thread when this is called.
348
+ *
349
+ * All streams in the current thread are synchronized with the current
350
+ * GL context.
351
+ *
352
+ * \param devPtr - Returned device pointer to CUDA object
353
+ * \param bufObj - Buffer object ID to map
354
+ *
355
+ * \return
356
+ * ::cudaSuccess,
357
+ * ::cudaErrorMapBufferObjectFailed
358
+ * \notefnerr
359
+ *
360
+ * \sa ::cudaGraphicsMapResources
361
+ */
362
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj);
363
+
364
+ /**
365
+ * \brief Unmaps a buffer object for access by CUDA
366
+ *
367
+ * \deprecated This function is deprecated as of CUDA 3.0.
368
+ *
369
+ * Unmaps the buffer object of ID \p bufObj for access by CUDA. When
370
+ * a buffer is unmapped, the base address returned by
371
+ * ::cudaGLMapBufferObject() is invalid and subsequent references to
372
+ * the address result in undefined behavior. The OpenGL context used
373
+ * to create the buffer, or another context from the same share group,
374
+ * must be bound to the current thread when this is called.
375
+ *
376
+ * All streams in the current thread are synchronized with the current
377
+ * GL context.
378
+ *
379
+ * \param bufObj - Buffer object to unmap
380
+ *
381
+ * \return
382
+ * ::cudaSuccess,
383
+ * ::cudaErrorUnmapBufferObjectFailed
384
+ * \notefnerr
385
+ *
386
+ * \sa ::cudaGraphicsUnmapResources
387
+ */
388
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj);
389
+
390
+ /**
391
+ * \brief Unregisters a buffer object for access by CUDA
392
+ *
393
+ * \deprecated This function is deprecated as of CUDA 3.0.
394
+ *
395
+ * Unregisters the buffer object of ID \p bufObj for access by CUDA
396
+ * and releases any CUDA resources associated with the buffer. Once a
397
+ * buffer is unregistered, it may no longer be mapped by CUDA. The GL
398
+ * context used to create the buffer, or another context from the
399
+ * same share group, must be bound to the current thread when this is
400
+ * called.
401
+ *
402
+ * \param bufObj - Buffer object to unregister
403
+ *
404
+ * \return
405
+ * ::cudaSuccess
406
+ * \notefnerr
407
+ *
408
+ * \sa ::cudaGraphicsUnregisterResource
409
+ */
410
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint bufObj);
411
+
412
+ /**
413
+ * \brief Set usage flags for mapping an OpenGL buffer
414
+ *
415
+ * \deprecated This function is deprecated as of CUDA 3.0.
416
+ *
417
+ * Set flags for mapping the OpenGL buffer \p bufObj
418
+ *
419
+ * Changes to flags will take effect the next time \p bufObj is mapped.
420
+ * The \p flags argument may be any of the following:
421
+ *
422
+ * - ::cudaGLMapFlagsNone: Specifies no hints about how this buffer will
423
+ * be used. It is therefore assumed that this buffer will be read from and
424
+ * written to by CUDA kernels. This is the default value.
425
+ * - ::cudaGLMapFlagsReadOnly: Specifies that CUDA kernels which access this
426
+ * buffer will not write to the buffer.
427
+ * - ::cudaGLMapFlagsWriteDiscard: Specifies that CUDA kernels which access
428
+ * this buffer will not read from the buffer and will write over the
429
+ * entire contents of the buffer, so none of the data previously stored in
430
+ * the buffer will be preserved.
431
+ *
432
+ * If \p bufObj has not been registered for use with CUDA, then
433
+ * ::cudaErrorInvalidResourceHandle is returned. If \p bufObj is presently
434
+ * mapped for access by CUDA, then ::cudaErrorUnknown is returned.
435
+ *
436
+ * \param bufObj - Registered buffer object to set flags for
437
+ * \param flags - Parameters for buffer mapping
438
+ *
439
+ * \return
440
+ * ::cudaSuccess,
441
+ * ::cudaErrorInvalidValue,
442
+ * ::cudaErrorInvalidResourceHandle,
443
+ * ::cudaErrorUnknown
444
+ * \notefnerr
445
+ *
446
+ * \sa ::cudaGraphicsResourceSetMapFlags
447
+ */
448
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint bufObj, unsigned int flags);
449
+
450
+ /**
451
+ * \brief Maps a buffer object for access by CUDA
452
+ *
453
+ * \deprecated This function is deprecated as of CUDA 3.0.
454
+ *
455
+ * Maps the buffer object of ID \p bufObj into the address space of
456
+ * CUDA and returns in \p *devPtr the base pointer of the resulting
457
+ * mapping. The buffer must have previously been registered by
458
+ * calling ::cudaGLRegisterBufferObject(). While a buffer is mapped
459
+ * by CUDA, any OpenGL operation which references the buffer will
460
+ * result in undefined behavior. The OpenGL context used to create
461
+ * the buffer, or another context from the same share group, must be
462
+ * bound to the current thread when this is called.
463
+ *
464
+ * Stream \p stream is synchronized with the current GL context.
465
+ *
466
+ * \param devPtr - Returned device pointer to CUDA object
467
+ * \param bufObj - Buffer object ID to map
468
+ * \param stream - Stream to synchronize
469
+ *
470
+ * \return
471
+ * ::cudaSuccess,
472
+ * ::cudaErrorMapBufferObjectFailed
473
+ * \notefnerr
474
+ *
475
+ * \sa ::cudaGraphicsMapResources
476
+ */
477
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **devPtr, GLuint bufObj, cudaStream_t stream);
478
+
479
+ /**
480
+ * \brief Unmaps a buffer object for access by CUDA
481
+ *
482
+ * \deprecated This function is deprecated as of CUDA 3.0.
483
+ *
484
+ * Unmaps the buffer object of ID \p bufObj for access by CUDA. When
485
+ * a buffer is unmapped, the base address returned by
486
+ * ::cudaGLMapBufferObject() is invalid and subsequent references to
487
+ * the address result in undefined behavior. The OpenGL context used
488
+ * to create the buffer, or another context from the same share group,
489
+ * must be bound to the current thread when this is called.
490
+ *
491
+ * Stream \p stream is synchronized with the current GL context.
492
+ *
493
+ * \param bufObj - Buffer object to unmap
494
+ * \param stream - Stream to synchronize
495
+ *
496
+ * \return
497
+ * ::cudaSuccess,
498
+ * ::cudaErrorUnmapBufferObjectFailed
499
+ * \notefnerr
500
+ *
501
+ * \sa ::cudaGraphicsUnmapResources
502
+ */
503
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint bufObj, cudaStream_t stream);
504
+
505
+ /** @} */ /* END CUDART_OPENGL_DEPRECATED */
506
+
507
+ #if defined(__cplusplus)
508
+ }
509
+ #endif /* __cplusplus */
510
+
511
+ #undef __CUDA_DEPRECATED
512
+
513
+ #endif /* __CUDA_GL_INTEROP_H__ */
514
+
.venv/lib/python3.11/site-packages/triton/backends/nvidia/include/cuda_runtime_api.h ADDED
The diff for this file is too large to render. See raw diff